Diffstat (limited to 'kernel/time')
-rw-r--r--   kernel/time/Kconfig                   |  195
-rw-r--r--   kernel/time/Makefile                  |   10
-rw-r--r--   kernel/time/alarmtimer.c              |  861
-rw-r--r--   kernel/time/clockevents.c             |  578
-rw-r--r--   kernel/time/clocksource.c             |  510
-rw-r--r--   kernel/time/jiffies.c                 |   64
-rw-r--r--   kernel/time/ntp.c                     |  675
-rw-r--r--   kernel/time/ntp_internal.h            |   12
-rw-r--r--   kernel/time/posix-clock.c             |  446
-rw-r--r--   kernel/time/sched_clock.c             |  217
-rw-r--r--   kernel/time/tick-broadcast-hrtimer.c  |  106
-rw-r--r--   kernel/time/tick-broadcast.c          |  544
-rw-r--r--   kernel/time/tick-common.c             |  257
-rw-r--r--   kernel/time/tick-internal.h           |   51
-rw-r--r--   kernel/time/tick-oneshot.c            |   82
-rw-r--r--   kernel/time/tick-sched.c              |  837
-rw-r--r--   kernel/time/timecompare.c             |  192
-rw-r--r--   kernel/time/timekeeping.c             | 1478
-rw-r--r--   kernel/time/timekeeping_debug.c       |   74
-rw-r--r--   kernel/time/timekeeping_internal.h    |   14
-rw-r--r--   kernel/time/timer_list.c              |  125
-rw-r--r--   kernel/time/timer_stats.c             |   16

22 files changed, 5850 insertions(+), 1494 deletions(-)
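
Before the patch body, a brief orientation. The largest new file below is kernel/time/alarmtimer.c, which adds an in-kernel alarm API: alarms behave like hrtimers but also program an RTC wakeup so they fire across suspend. What follows is an editor-added minimal sketch, not part of the patch, showing how a hypothetical module might drive the helpers exported in that file (alarm_init(), alarm_start_relative(), alarm_cancel()); the module and callback names are invented for illustration.

/* Editor sketch: hypothetical user of the alarmtimer API added below. */
#include <linux/module.h>
#include <linux/ktime.h>
#include <linux/alarmtimer.h>

static struct alarm demo_alarm;		/* hypothetical name */

/* Alarm callback; returning ALARMTIMER_NORESTART keeps it one-shot. */
static enum alarmtimer_restart demo_alarm_fn(struct alarm *alarm, ktime_t now)
{
	pr_info("demo alarm fired\n");
	return ALARMTIMER_NORESTART;
}

static int __init demo_init(void)
{
	/* ALARM_BOOTTIME also counts time spent in suspend via the RTC. */
	alarm_init(&demo_alarm, ALARM_BOOTTIME, demo_alarm_fn);
	/* Expire roughly 30s from now; wakes the machine if it is suspended. */
	alarm_start_relative(&demo_alarm, ktime_set(30, 0));
	return 0;
}

static void __exit demo_exit(void)
{
	alarm_cancel(&demo_alarm);
}

module_init(demo_init);
module_exit(demo_exit);
MODULE_LICENSE("GPL");

A callback that advances its expiry (e.g. via alarm_forward()) and returns ALARMTIMER_RESTART is re-queued, which is how the posix-timer glue in the patch implements periodic alarms.
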
diff --git a/kernel/time/Kconfig b/kernel/time/Kconfig index f06a8a36564..f448513a45e 100644 --- a/kernel/time/Kconfig +++ b/kernel/time/Kconfig @@ -1,17 +1,195 @@  #  # Timer subsystem related configuration options  # + +# Options selectable by arch Kconfig + +# Watchdog function for clocksources to detect instabilities +config CLOCKSOURCE_WATCHDOG +	bool + +# Architecture has extra clocksource data +config ARCH_CLOCKSOURCE_DATA +	bool + +# Timekeeping vsyscall support +config GENERIC_TIME_VSYSCALL +	bool + +# Timekeeping vsyscall support +config GENERIC_TIME_VSYSCALL_OLD +	bool + +# ktime_t scalar 64bit nsec representation +config KTIME_SCALAR +	bool + +# Old style timekeeping +config ARCH_USES_GETTIMEOFFSET +	bool + +# The generic clock events infrastructure +config GENERIC_CLOCKEVENTS +	bool + +# Migration helper. Builds, but does not invoke +config GENERIC_CLOCKEVENTS_BUILD +	bool +	default y +	depends on GENERIC_CLOCKEVENTS + +# Architecture can handle broadcast in a driver-agnostic way +config ARCH_HAS_TICK_BROADCAST +	bool + +# Clockevents broadcasting infrastructure +config GENERIC_CLOCKEVENTS_BROADCAST +	bool +	depends on GENERIC_CLOCKEVENTS + +# Automatically adjust the min. reprogramming time for +# clock event device +config GENERIC_CLOCKEVENTS_MIN_ADJUST +	bool + +# Generic update of CMOS clock +config GENERIC_CMOS_UPDATE +	bool + +if GENERIC_CLOCKEVENTS +menu "Timers subsystem" + +# Core internal switch. Selected by NO_HZ_COMMON / HIGH_RES_TIMERS. This is +# only related to the tick functionality. Oneshot clockevent devices +# are supported independ of this.  config TICK_ONESHOT  	bool -config NO_HZ -	bool "Tickless System (Dynamic Ticks)" +config NO_HZ_COMMON +	bool  	depends on !ARCH_USES_GETTIMEOFFSET && GENERIC_CLOCKEVENTS  	select TICK_ONESHOT + +choice +	prompt "Timer tick handling" +	default NO_HZ_IDLE if NO_HZ + +config HZ_PERIODIC +	bool "Periodic timer ticks (constant rate, no dynticks)" +	help +	  This option keeps the tick running periodically at a constant +	  rate, even when the CPU doesn't need it. + +config NO_HZ_IDLE +	bool "Idle dynticks system (tickless idle)" +	depends on !ARCH_USES_GETTIMEOFFSET && GENERIC_CLOCKEVENTS +	select NO_HZ_COMMON +	help +	  This option enables a tickless idle system: timer interrupts +	  will only trigger on an as-needed basis when the system is idle. +	  This is usually interesting for energy saving. + +	  Most of the time you want to say Y here. + +config NO_HZ_FULL +	bool "Full dynticks system (tickless)" +	# NO_HZ_COMMON dependency +	depends on !ARCH_USES_GETTIMEOFFSET && GENERIC_CLOCKEVENTS +	# We need at least one periodic CPU for timekeeping +	depends on SMP +	# RCU_USER_QS dependency +	depends on HAVE_CONTEXT_TRACKING +	# VIRT_CPU_ACCOUNTING_GEN dependency +	depends on HAVE_VIRT_CPU_ACCOUNTING_GEN +	select NO_HZ_COMMON +	select RCU_USER_QS +	select RCU_NOCB_CPU +	select VIRT_CPU_ACCOUNTING_GEN +	select IRQ_WORK +	help +	 Adaptively try to shutdown the tick whenever possible, even when +	 the CPU is running tasks. Typically this requires running a single +	 task on the CPU. Chances for running tickless are maximized when +	 the task mostly runs in userspace and has few kernel activity. + +	 You need to fill up the nohz_full boot parameter with the +	 desired range of dynticks CPUs. + +	 This is implemented at the expense of some overhead in user <-> kernel +	 transitions: syscalls, exceptions and interrupts. Even when it's +	 dynamically off. + +	 Say N. 
+ +endchoice + +config NO_HZ_FULL_ALL +       bool "Full dynticks system on all CPUs by default (except CPU 0)" +       depends on NO_HZ_FULL +       help +         If the user doesn't pass the nohz_full boot option to +	 define the range of full dynticks CPUs, consider that all +	 CPUs in the system are full dynticks by default. +	 Note the boot CPU will still be kept outside the range to +	 handle the timekeeping duty. + +config NO_HZ_FULL_SYSIDLE +	bool "Detect full-system idle state for full dynticks system" +	depends on NO_HZ_FULL +	default n +	help +	 At least one CPU must keep the scheduling-clock tick running for +	 timekeeping purposes whenever there is a non-idle CPU, where +	 "non-idle" also includes dynticks CPUs as long as they are +	 running non-idle tasks.  Because the underlying adaptive-tick +	 support cannot distinguish between all CPUs being idle and +	 all CPUs each running a single task in dynticks mode, the +	 underlying support simply ensures that there is always a CPU +	 handling the scheduling-clock tick, whether or not all CPUs +	 are idle.  This Kconfig option enables scalable detection of +	 the all-CPUs-idle state, thus allowing the scheduling-clock +	 tick to be disabled when all CPUs are idle.  Note that scalable +	 detection of the all-CPUs-idle state means that larger systems +	 will be slower to declare the all-CPUs-idle state. + +	 Say Y if you would like to help debug all-CPUs-idle detection. + +	 Say N if you are unsure. + +config NO_HZ_FULL_SYSIDLE_SMALL +	int "Number of CPUs above which large-system approach is used" +	depends on NO_HZ_FULL_SYSIDLE +	range 1 NR_CPUS +	default 8  	help -	  This option enables a tickless system: timer interrupts will -	  only trigger on an as-needed basis both when the system is -	  busy and when the system is idle. +	 The full-system idle detection mechanism takes a lazy approach +	 on large systems, as is required to attain decent scalability. +	 However, on smaller systems, scalability is not anywhere near as +	 large a concern as is energy efficiency.  The sysidle subsystem +	 therefore uses a fast but non-scalable algorithm for small +	 systems and a lazier but scalable algorithm for large systems. +	 This Kconfig parameter defines the number of CPUs in the largest +	 system that will be considered to be "small". + +	 The default value will be fine in most cases.	Battery-powered +	 systems that (1) enable NO_HZ_FULL_SYSIDLE, (2) have larger +	 numbers of CPUs, and (3) are suffering from battery-lifetime +	 problems due to long sysidle latencies might wish to experiment +	 with larger values for this Kconfig parameter.  On the other +	 hand, they might be even better served by disabling NO_HZ_FULL +	 entirely, given that NO_HZ_FULL is intended for HPC and +	 real-time workloads that at present do not tend to be run on +	 battery-powered systems. + +	 Take the default if you are unsure. + +config NO_HZ +	bool "Old Idle dynticks config" +	depends on !ARCH_USES_GETTIMEOFFSET && GENERIC_CLOCKEVENTS +	help +	  This is the old config entry that enables dynticks idle. +	  We keep it around for a little while to enforce backward +	  compatibility with older config files.  config HIGH_RES_TIMERS  	bool "High Resolution Timer Support" @@ -22,8 +200,5 @@ config HIGH_RES_TIMERS  	  hardware is not capable then this option only increases  	  the size of the kernel image. 
-config GENERIC_CLOCKEVENTS_BUILD -	bool -	default y -	depends on GENERIC_CLOCKEVENTS || GENERIC_CLOCKEVENTS_MIGR - +endmenu +endif diff --git a/kernel/time/Makefile b/kernel/time/Makefile index ee266620b06..57a413fd0eb 100644 --- a/kernel/time/Makefile +++ b/kernel/time/Makefile @@ -1,8 +1,14 @@ -obj-y += timekeeping.o ntp.o clocksource.o jiffies.o timer_list.o timecompare.o timeconv.o +obj-y += timekeeping.o ntp.o clocksource.o jiffies.o timer_list.o +obj-y += timeconv.o posix-clock.o alarmtimer.o  obj-$(CONFIG_GENERIC_CLOCKEVENTS_BUILD)		+= clockevents.o  obj-$(CONFIG_GENERIC_CLOCKEVENTS)		+= tick-common.o -obj-$(CONFIG_GENERIC_CLOCKEVENTS_BROADCAST)	+= tick-broadcast.o +ifeq ($(CONFIG_GENERIC_CLOCKEVENTS_BROADCAST),y) + obj-y						+= tick-broadcast.o + obj-$(CONFIG_TICK_ONESHOT)			+= tick-broadcast-hrtimer.o +endif +obj-$(CONFIG_GENERIC_SCHED_CLOCK)		+= sched_clock.o  obj-$(CONFIG_TICK_ONESHOT)			+= tick-oneshot.o  obj-$(CONFIG_TICK_ONESHOT)			+= tick-sched.o  obj-$(CONFIG_TIMER_STATS)			+= timer_stats.o +obj-$(CONFIG_DEBUG_FS)				+= timekeeping_debug.o diff --git a/kernel/time/alarmtimer.c b/kernel/time/alarmtimer.c new file mode 100644 index 00000000000..fe75444ae7e --- /dev/null +++ b/kernel/time/alarmtimer.c @@ -0,0 +1,861 @@ +/* + * Alarmtimer interface + * + * This interface provides a timer which is similarto hrtimers, + * but triggers a RTC alarm if the box is suspend. + * + * This interface is influenced by the Android RTC Alarm timer + * interface. + * + * Copyright (C) 2010 IBM Corperation + * + * Author: John Stultz <john.stultz@linaro.org> + * + * This program is free software; you can redistribute it and/or modify + * it under the terms of the GNU General Public License version 2 as + * published by the Free Software Foundation. + */ +#include <linux/time.h> +#include <linux/hrtimer.h> +#include <linux/timerqueue.h> +#include <linux/rtc.h> +#include <linux/alarmtimer.h> +#include <linux/mutex.h> +#include <linux/platform_device.h> +#include <linux/posix-timers.h> +#include <linux/workqueue.h> +#include <linux/freezer.h> + +/** + * struct alarm_base - Alarm timer bases + * @lock:		Lock for syncrhonized access to the base + * @timerqueue:		Timerqueue head managing the list of events + * @timer: 		hrtimer used to schedule events while running + * @gettime:		Function to read the time correlating to the base + * @base_clockid:	clockid for the base + */ +static struct alarm_base { +	spinlock_t		lock; +	struct timerqueue_head	timerqueue; +	ktime_t			(*gettime)(void); +	clockid_t		base_clockid; +} alarm_bases[ALARM_NUMTYPE]; + +/* freezer delta & lock used to handle clock_nanosleep triggered wakeups */ +static ktime_t freezer_delta; +static DEFINE_SPINLOCK(freezer_delta_lock); + +static struct wakeup_source *ws; + +#ifdef CONFIG_RTC_CLASS +/* rtc timer and device for setting alarm wakeups at suspend */ +static struct rtc_timer		rtctimer; +static struct rtc_device	*rtcdev; +static DEFINE_SPINLOCK(rtcdev_lock); + +/** + * alarmtimer_get_rtcdev - Return selected rtcdevice + * + * This function returns the rtc device to use for wakealarms. + * If one has not already been chosen, it checks to see if a + * functional rtc device is available. 
+ */ +struct rtc_device *alarmtimer_get_rtcdev(void) +{ +	unsigned long flags; +	struct rtc_device *ret; + +	spin_lock_irqsave(&rtcdev_lock, flags); +	ret = rtcdev; +	spin_unlock_irqrestore(&rtcdev_lock, flags); + +	return ret; +} + + +static int alarmtimer_rtc_add_device(struct device *dev, +				struct class_interface *class_intf) +{ +	unsigned long flags; +	struct rtc_device *rtc = to_rtc_device(dev); + +	if (rtcdev) +		return -EBUSY; + +	if (!rtc->ops->set_alarm) +		return -1; +	if (!device_may_wakeup(rtc->dev.parent)) +		return -1; + +	spin_lock_irqsave(&rtcdev_lock, flags); +	if (!rtcdev) { +		rtcdev = rtc; +		/* hold a reference so it doesn't go away */ +		get_device(dev); +	} +	spin_unlock_irqrestore(&rtcdev_lock, flags); +	return 0; +} + +static inline void alarmtimer_rtc_timer_init(void) +{ +	rtc_timer_init(&rtctimer, NULL, NULL); +} + +static struct class_interface alarmtimer_rtc_interface = { +	.add_dev = &alarmtimer_rtc_add_device, +}; + +static int alarmtimer_rtc_interface_setup(void) +{ +	alarmtimer_rtc_interface.class = rtc_class; +	return class_interface_register(&alarmtimer_rtc_interface); +} +static void alarmtimer_rtc_interface_remove(void) +{ +	class_interface_unregister(&alarmtimer_rtc_interface); +} +#else +struct rtc_device *alarmtimer_get_rtcdev(void) +{ +	return NULL; +} +#define rtcdev (NULL) +static inline int alarmtimer_rtc_interface_setup(void) { return 0; } +static inline void alarmtimer_rtc_interface_remove(void) { } +static inline void alarmtimer_rtc_timer_init(void) { } +#endif + +/** + * alarmtimer_enqueue - Adds an alarm timer to an alarm_base timerqueue + * @base: pointer to the base where the timer is being run + * @alarm: pointer to alarm being enqueued. + * + * Adds alarm to a alarm_base timerqueue + * + * Must hold base->lock when calling. + */ +static void alarmtimer_enqueue(struct alarm_base *base, struct alarm *alarm) +{ +	if (alarm->state & ALARMTIMER_STATE_ENQUEUED) +		timerqueue_del(&base->timerqueue, &alarm->node); + +	timerqueue_add(&base->timerqueue, &alarm->node); +	alarm->state |= ALARMTIMER_STATE_ENQUEUED; +} + +/** + * alarmtimer_dequeue - Removes an alarm timer from an alarm_base timerqueue + * @base: pointer to the base where the timer is running + * @alarm: pointer to alarm being removed + * + * Removes alarm to a alarm_base timerqueue + * + * Must hold base->lock when calling. + */ +static void alarmtimer_dequeue(struct alarm_base *base, struct alarm *alarm) +{ +	if (!(alarm->state & ALARMTIMER_STATE_ENQUEUED)) +		return; + +	timerqueue_del(&base->timerqueue, &alarm->node); +	alarm->state &= ~ALARMTIMER_STATE_ENQUEUED; +} + + +/** + * alarmtimer_fired - Handles alarm hrtimer being fired. + * @timer: pointer to hrtimer being run + * + * When a alarm timer fires, this runs through the timerqueue to + * see which alarms expired, and runs those. If there are more alarm + * timers queued for the future, we set the hrtimer to fire when + * when the next future alarm timer expires. 
+ */ +static enum hrtimer_restart alarmtimer_fired(struct hrtimer *timer) +{ +	struct alarm *alarm = container_of(timer, struct alarm, timer); +	struct alarm_base *base = &alarm_bases[alarm->type]; +	unsigned long flags; +	int ret = HRTIMER_NORESTART; +	int restart = ALARMTIMER_NORESTART; + +	spin_lock_irqsave(&base->lock, flags); +	alarmtimer_dequeue(base, alarm); +	spin_unlock_irqrestore(&base->lock, flags); + +	if (alarm->function) +		restart = alarm->function(alarm, base->gettime()); + +	spin_lock_irqsave(&base->lock, flags); +	if (restart != ALARMTIMER_NORESTART) { +		hrtimer_set_expires(&alarm->timer, alarm->node.expires); +		alarmtimer_enqueue(base, alarm); +		ret = HRTIMER_RESTART; +	} +	spin_unlock_irqrestore(&base->lock, flags); + +	return ret; + +} + +ktime_t alarm_expires_remaining(const struct alarm *alarm) +{ +	struct alarm_base *base = &alarm_bases[alarm->type]; +	return ktime_sub(alarm->node.expires, base->gettime()); +} +EXPORT_SYMBOL_GPL(alarm_expires_remaining); + +#ifdef CONFIG_RTC_CLASS +/** + * alarmtimer_suspend - Suspend time callback + * @dev: unused + * @state: unused + * + * When we are going into suspend, we look through the bases + * to see which is the soonest timer to expire. We then + * set an rtc timer to fire that far into the future, which + * will wake us from suspend. + */ +static int alarmtimer_suspend(struct device *dev) +{ +	struct rtc_time tm; +	ktime_t min, now; +	unsigned long flags; +	struct rtc_device *rtc; +	int i; +	int ret; + +	spin_lock_irqsave(&freezer_delta_lock, flags); +	min = freezer_delta; +	freezer_delta = ktime_set(0, 0); +	spin_unlock_irqrestore(&freezer_delta_lock, flags); + +	rtc = alarmtimer_get_rtcdev(); +	/* If we have no rtcdev, just return */ +	if (!rtc) +		return 0; + +	/* Find the soonest timer to expire*/ +	for (i = 0; i < ALARM_NUMTYPE; i++) { +		struct alarm_base *base = &alarm_bases[i]; +		struct timerqueue_node *next; +		ktime_t delta; + +		spin_lock_irqsave(&base->lock, flags); +		next = timerqueue_getnext(&base->timerqueue); +		spin_unlock_irqrestore(&base->lock, flags); +		if (!next) +			continue; +		delta = ktime_sub(next->expires, base->gettime()); +		if (!min.tv64 || (delta.tv64 < min.tv64)) +			min = delta; +	} +	if (min.tv64 == 0) +		return 0; + +	if (ktime_to_ns(min) < 2 * NSEC_PER_SEC) { +		__pm_wakeup_event(ws, 2 * MSEC_PER_SEC); +		return -EBUSY; +	} + +	/* Setup an rtc timer to fire that far in the future */ +	rtc_timer_cancel(rtc, &rtctimer); +	rtc_read_time(rtc, &tm); +	now = rtc_tm_to_ktime(tm); +	now = ktime_add(now, min); + +	/* Set alarm, if in the past reject suspend briefly to handle */ +	ret = rtc_timer_start(rtc, &rtctimer, now, ktime_set(0, 0)); +	if (ret < 0) +		__pm_wakeup_event(ws, MSEC_PER_SEC); +	return ret; +} +#else +static int alarmtimer_suspend(struct device *dev) +{ +	return 0; +} +#endif + +static void alarmtimer_freezerset(ktime_t absexp, enum alarmtimer_type type) +{ +	ktime_t delta; +	unsigned long flags; +	struct alarm_base *base = &alarm_bases[type]; + +	delta = ktime_sub(absexp, base->gettime()); + +	spin_lock_irqsave(&freezer_delta_lock, flags); +	if (!freezer_delta.tv64 || (delta.tv64 < freezer_delta.tv64)) +		freezer_delta = delta; +	spin_unlock_irqrestore(&freezer_delta_lock, flags); +} + + +/** + * alarm_init - Initialize an alarm structure + * @alarm: ptr to alarm to be initialized + * @type: the type of the alarm + * @function: callback that is run when the alarm fires + */ +void alarm_init(struct alarm *alarm, enum alarmtimer_type type, +		enum alarmtimer_restart 
(*function)(struct alarm *, ktime_t)) +{ +	timerqueue_init(&alarm->node); +	hrtimer_init(&alarm->timer, alarm_bases[type].base_clockid, +			HRTIMER_MODE_ABS); +	alarm->timer.function = alarmtimer_fired; +	alarm->function = function; +	alarm->type = type; +	alarm->state = ALARMTIMER_STATE_INACTIVE; +} +EXPORT_SYMBOL_GPL(alarm_init); + +/** + * alarm_start - Sets an absolute alarm to fire + * @alarm: ptr to alarm to set + * @start: time to run the alarm + */ +int alarm_start(struct alarm *alarm, ktime_t start) +{ +	struct alarm_base *base = &alarm_bases[alarm->type]; +	unsigned long flags; +	int ret; + +	spin_lock_irqsave(&base->lock, flags); +	alarm->node.expires = start; +	alarmtimer_enqueue(base, alarm); +	ret = hrtimer_start(&alarm->timer, alarm->node.expires, +				HRTIMER_MODE_ABS); +	spin_unlock_irqrestore(&base->lock, flags); +	return ret; +} +EXPORT_SYMBOL_GPL(alarm_start); + +/** + * alarm_start_relative - Sets a relative alarm to fire + * @alarm: ptr to alarm to set + * @start: time relative to now to run the alarm + */ +int alarm_start_relative(struct alarm *alarm, ktime_t start) +{ +	struct alarm_base *base = &alarm_bases[alarm->type]; + +	start = ktime_add(start, base->gettime()); +	return alarm_start(alarm, start); +} +EXPORT_SYMBOL_GPL(alarm_start_relative); + +void alarm_restart(struct alarm *alarm) +{ +	struct alarm_base *base = &alarm_bases[alarm->type]; +	unsigned long flags; + +	spin_lock_irqsave(&base->lock, flags); +	hrtimer_set_expires(&alarm->timer, alarm->node.expires); +	hrtimer_restart(&alarm->timer); +	alarmtimer_enqueue(base, alarm); +	spin_unlock_irqrestore(&base->lock, flags); +} +EXPORT_SYMBOL_GPL(alarm_restart); + +/** + * alarm_try_to_cancel - Tries to cancel an alarm timer + * @alarm: ptr to alarm to be canceled + * + * Returns 1 if the timer was canceled, 0 if it was not running, + * and -1 if the callback was running + */ +int alarm_try_to_cancel(struct alarm *alarm) +{ +	struct alarm_base *base = &alarm_bases[alarm->type]; +	unsigned long flags; +	int ret; + +	spin_lock_irqsave(&base->lock, flags); +	ret = hrtimer_try_to_cancel(&alarm->timer); +	if (ret >= 0) +		alarmtimer_dequeue(base, alarm); +	spin_unlock_irqrestore(&base->lock, flags); +	return ret; +} +EXPORT_SYMBOL_GPL(alarm_try_to_cancel); + + +/** + * alarm_cancel - Spins trying to cancel an alarm timer until it is done + * @alarm: ptr to alarm to be canceled + * + * Returns 1 if the timer was canceled, 0 if it was not active. 
+ */ +int alarm_cancel(struct alarm *alarm) +{ +	for (;;) { +		int ret = alarm_try_to_cancel(alarm); +		if (ret >= 0) +			return ret; +		cpu_relax(); +	} +} +EXPORT_SYMBOL_GPL(alarm_cancel); + + +u64 alarm_forward(struct alarm *alarm, ktime_t now, ktime_t interval) +{ +	u64 overrun = 1; +	ktime_t delta; + +	delta = ktime_sub(now, alarm->node.expires); + +	if (delta.tv64 < 0) +		return 0; + +	if (unlikely(delta.tv64 >= interval.tv64)) { +		s64 incr = ktime_to_ns(interval); + +		overrun = ktime_divns(delta, incr); + +		alarm->node.expires = ktime_add_ns(alarm->node.expires, +							incr*overrun); + +		if (alarm->node.expires.tv64 > now.tv64) +			return overrun; +		/* +		 * This (and the ktime_add() below) is the +		 * correction for exact: +		 */ +		overrun++; +	} + +	alarm->node.expires = ktime_add(alarm->node.expires, interval); +	return overrun; +} +EXPORT_SYMBOL_GPL(alarm_forward); + +u64 alarm_forward_now(struct alarm *alarm, ktime_t interval) +{ +	struct alarm_base *base = &alarm_bases[alarm->type]; + +	return alarm_forward(alarm, base->gettime(), interval); +} +EXPORT_SYMBOL_GPL(alarm_forward_now); + + +/** + * clock2alarm - helper that converts from clockid to alarmtypes + * @clockid: clockid. + */ +static enum alarmtimer_type clock2alarm(clockid_t clockid) +{ +	if (clockid == CLOCK_REALTIME_ALARM) +		return ALARM_REALTIME; +	if (clockid == CLOCK_BOOTTIME_ALARM) +		return ALARM_BOOTTIME; +	return -1; +} + +/** + * alarm_handle_timer - Callback for posix timers + * @alarm: alarm that fired + * + * Posix timer callback for expired alarm timers. + */ +static enum alarmtimer_restart alarm_handle_timer(struct alarm *alarm, +							ktime_t now) +{ +	struct k_itimer *ptr = container_of(alarm, struct k_itimer, +						it.alarm.alarmtimer); +	if (posix_timer_event(ptr, 0) != 0) +		ptr->it_overrun++; + +	/* Re-add periodic timers */ +	if (ptr->it.alarm.interval.tv64) { +		ptr->it_overrun += alarm_forward(alarm, now, +						ptr->it.alarm.interval); +		return ALARMTIMER_RESTART; +	} +	return ALARMTIMER_NORESTART; +} + +/** + * alarm_clock_getres - posix getres interface + * @which_clock: clockid + * @tp: timespec to fill + * + * Returns the granularity of underlying alarm base clock + */ +static int alarm_clock_getres(const clockid_t which_clock, struct timespec *tp) +{ +	clockid_t baseid = alarm_bases[clock2alarm(which_clock)].base_clockid; + +	if (!alarmtimer_get_rtcdev()) +		return -EINVAL; + +	return hrtimer_get_res(baseid, tp); +} + +/** + * alarm_clock_get - posix clock_get interface + * @which_clock: clockid + * @tp: timespec to fill. + * + * Provides the underlying alarm base time. + */ +static int alarm_clock_get(clockid_t which_clock, struct timespec *tp) +{ +	struct alarm_base *base = &alarm_bases[clock2alarm(which_clock)]; + +	if (!alarmtimer_get_rtcdev()) +		return -EINVAL; + +	*tp = ktime_to_timespec(base->gettime()); +	return 0; +} + +/** + * alarm_timer_create - posix timer_create interface + * @new_timer: k_itimer pointer to manage + * + * Initializes the k_itimer structure. 
+ */ +static int alarm_timer_create(struct k_itimer *new_timer) +{ +	enum  alarmtimer_type type; +	struct alarm_base *base; + +	if (!alarmtimer_get_rtcdev()) +		return -ENOTSUPP; + +	if (!capable(CAP_WAKE_ALARM)) +		return -EPERM; + +	type = clock2alarm(new_timer->it_clock); +	base = &alarm_bases[type]; +	alarm_init(&new_timer->it.alarm.alarmtimer, type, alarm_handle_timer); +	return 0; +} + +/** + * alarm_timer_get - posix timer_get interface + * @new_timer: k_itimer pointer + * @cur_setting: itimerspec data to fill + * + * Copies the itimerspec data out from the k_itimer + */ +static void alarm_timer_get(struct k_itimer *timr, +				struct itimerspec *cur_setting) +{ +	memset(cur_setting, 0, sizeof(struct itimerspec)); + +	cur_setting->it_interval = +			ktime_to_timespec(timr->it.alarm.interval); +	cur_setting->it_value = +		ktime_to_timespec(timr->it.alarm.alarmtimer.node.expires); +	return; +} + +/** + * alarm_timer_del - posix timer_del interface + * @timr: k_itimer pointer to be deleted + * + * Cancels any programmed alarms for the given timer. + */ +static int alarm_timer_del(struct k_itimer *timr) +{ +	if (!rtcdev) +		return -ENOTSUPP; + +	if (alarm_try_to_cancel(&timr->it.alarm.alarmtimer) < 0) +		return TIMER_RETRY; + +	return 0; +} + +/** + * alarm_timer_set - posix timer_set interface + * @timr: k_itimer pointer to be deleted + * @flags: timer flags + * @new_setting: itimerspec to be used + * @old_setting: itimerspec being replaced + * + * Sets the timer to new_setting, and starts the timer. + */ +static int alarm_timer_set(struct k_itimer *timr, int flags, +				struct itimerspec *new_setting, +				struct itimerspec *old_setting) +{ +	ktime_t exp; + +	if (!rtcdev) +		return -ENOTSUPP; + +	if (flags & ~TIMER_ABSTIME) +		return -EINVAL; + +	if (old_setting) +		alarm_timer_get(timr, old_setting); + +	/* If the timer was already set, cancel it */ +	if (alarm_try_to_cancel(&timr->it.alarm.alarmtimer) < 0) +		return TIMER_RETRY; + +	/* start the timer */ +	timr->it.alarm.interval = timespec_to_ktime(new_setting->it_interval); +	exp = timespec_to_ktime(new_setting->it_value); +	/* Convert (if necessary) to absolute time */ +	if (flags != TIMER_ABSTIME) { +		ktime_t now; + +		now = alarm_bases[timr->it.alarm.alarmtimer.type].gettime(); +		exp = ktime_add(now, exp); +	} + +	alarm_start(&timr->it.alarm.alarmtimer, exp); +	return 0; +} + +/** + * alarmtimer_nsleep_wakeup - Wakeup function for alarm_timer_nsleep + * @alarm: ptr to alarm that fired + * + * Wakes up the task that set the alarmtimer + */ +static enum alarmtimer_restart alarmtimer_nsleep_wakeup(struct alarm *alarm, +								ktime_t now) +{ +	struct task_struct *task = (struct task_struct *)alarm->data; + +	alarm->data = NULL; +	if (task) +		wake_up_process(task); +	return ALARMTIMER_NORESTART; +} + +/** + * alarmtimer_do_nsleep - Internal alarmtimer nsleep implementation + * @alarm: ptr to alarmtimer + * @absexp: absolute expiration time + * + * Sets the alarm timer and sleeps until it is fired or interrupted. 
+ */ +static int alarmtimer_do_nsleep(struct alarm *alarm, ktime_t absexp) +{ +	alarm->data = (void *)current; +	do { +		set_current_state(TASK_INTERRUPTIBLE); +		alarm_start(alarm, absexp); +		if (likely(alarm->data)) +			schedule(); + +		alarm_cancel(alarm); +	} while (alarm->data && !signal_pending(current)); + +	__set_current_state(TASK_RUNNING); + +	return (alarm->data == NULL); +} + + +/** + * update_rmtp - Update remaining timespec value + * @exp: expiration time + * @type: timer type + * @rmtp: user pointer to remaining timepsec value + * + * Helper function that fills in rmtp value with time between + * now and the exp value + */ +static int update_rmtp(ktime_t exp, enum  alarmtimer_type type, +			struct timespec __user *rmtp) +{ +	struct timespec rmt; +	ktime_t rem; + +	rem = ktime_sub(exp, alarm_bases[type].gettime()); + +	if (rem.tv64 <= 0) +		return 0; +	rmt = ktime_to_timespec(rem); + +	if (copy_to_user(rmtp, &rmt, sizeof(*rmtp))) +		return -EFAULT; + +	return 1; + +} + +/** + * alarm_timer_nsleep_restart - restartblock alarmtimer nsleep + * @restart: ptr to restart block + * + * Handles restarted clock_nanosleep calls + */ +static long __sched alarm_timer_nsleep_restart(struct restart_block *restart) +{ +	enum  alarmtimer_type type = restart->nanosleep.clockid; +	ktime_t exp; +	struct timespec __user  *rmtp; +	struct alarm alarm; +	int ret = 0; + +	exp.tv64 = restart->nanosleep.expires; +	alarm_init(&alarm, type, alarmtimer_nsleep_wakeup); + +	if (alarmtimer_do_nsleep(&alarm, exp)) +		goto out; + +	if (freezing(current)) +		alarmtimer_freezerset(exp, type); + +	rmtp = restart->nanosleep.rmtp; +	if (rmtp) { +		ret = update_rmtp(exp, type, rmtp); +		if (ret <= 0) +			goto out; +	} + + +	/* The other values in restart are already filled in */ +	ret = -ERESTART_RESTARTBLOCK; +out: +	return ret; +} + +/** + * alarm_timer_nsleep - alarmtimer nanosleep + * @which_clock: clockid + * @flags: determins abstime or relative + * @tsreq: requested sleep time (abs or rel) + * @rmtp: remaining sleep time saved + * + * Handles clock_nanosleep calls against _ALARM clockids + */ +static int alarm_timer_nsleep(const clockid_t which_clock, int flags, +		     struct timespec *tsreq, struct timespec __user *rmtp) +{ +	enum  alarmtimer_type type = clock2alarm(which_clock); +	struct alarm alarm; +	ktime_t exp; +	int ret = 0; +	struct restart_block *restart; + +	if (!alarmtimer_get_rtcdev()) +		return -ENOTSUPP; + +	if (flags & ~TIMER_ABSTIME) +		return -EINVAL; + +	if (!capable(CAP_WAKE_ALARM)) +		return -EPERM; + +	alarm_init(&alarm, type, alarmtimer_nsleep_wakeup); + +	exp = timespec_to_ktime(*tsreq); +	/* Convert (if necessary) to absolute time */ +	if (flags != TIMER_ABSTIME) { +		ktime_t now = alarm_bases[type].gettime(); +		exp = ktime_add(now, exp); +	} + +	if (alarmtimer_do_nsleep(&alarm, exp)) +		goto out; + +	if (freezing(current)) +		alarmtimer_freezerset(exp, type); + +	/* abs timers don't set remaining time or restart */ +	if (flags == TIMER_ABSTIME) { +		ret = -ERESTARTNOHAND; +		goto out; +	} + +	if (rmtp) { +		ret = update_rmtp(exp, type, rmtp); +		if (ret <= 0) +			goto out; +	} + +	restart = ¤t_thread_info()->restart_block; +	restart->fn = alarm_timer_nsleep_restart; +	restart->nanosleep.clockid = type; +	restart->nanosleep.expires = exp.tv64; +	restart->nanosleep.rmtp = rmtp; +	ret = -ERESTART_RESTARTBLOCK; + +out: +	return ret; +} + + +/* Suspend hook structures */ +static const struct dev_pm_ops alarmtimer_pm_ops = { +	.suspend = alarmtimer_suspend, +}; + +static struct 
platform_driver alarmtimer_driver = { +	.driver = { +		.name = "alarmtimer", +		.pm = &alarmtimer_pm_ops, +	} +}; + +/** + * alarmtimer_init - Initialize alarm timer code + * + * This function initializes the alarm bases and registers + * the posix clock ids. + */ +static int __init alarmtimer_init(void) +{ +	struct platform_device *pdev; +	int error = 0; +	int i; +	struct k_clock alarm_clock = { +		.clock_getres	= alarm_clock_getres, +		.clock_get	= alarm_clock_get, +		.timer_create	= alarm_timer_create, +		.timer_set	= alarm_timer_set, +		.timer_del	= alarm_timer_del, +		.timer_get	= alarm_timer_get, +		.nsleep		= alarm_timer_nsleep, +	}; + +	alarmtimer_rtc_timer_init(); + +	posix_timers_register_clock(CLOCK_REALTIME_ALARM, &alarm_clock); +	posix_timers_register_clock(CLOCK_BOOTTIME_ALARM, &alarm_clock); + +	/* Initialize alarm bases */ +	alarm_bases[ALARM_REALTIME].base_clockid = CLOCK_REALTIME; +	alarm_bases[ALARM_REALTIME].gettime = &ktime_get_real; +	alarm_bases[ALARM_BOOTTIME].base_clockid = CLOCK_BOOTTIME; +	alarm_bases[ALARM_BOOTTIME].gettime = &ktime_get_boottime; +	for (i = 0; i < ALARM_NUMTYPE; i++) { +		timerqueue_init_head(&alarm_bases[i].timerqueue); +		spin_lock_init(&alarm_bases[i].lock); +	} + +	error = alarmtimer_rtc_interface_setup(); +	if (error) +		return error; + +	error = platform_driver_register(&alarmtimer_driver); +	if (error) +		goto out_if; + +	pdev = platform_device_register_simple("alarmtimer", -1, NULL, 0); +	if (IS_ERR(pdev)) { +		error = PTR_ERR(pdev); +		goto out_drv; +	} +	ws = wakeup_source_register("alarmtimer"); +	return 0; + +out_drv: +	platform_driver_unregister(&alarmtimer_driver); +out_if: +	alarmtimer_rtc_interface_remove(); +	return error; +} +device_initcall(alarmtimer_init); diff --git a/kernel/time/clockevents.c b/kernel/time/clockevents.c index d7395fdfb9f..9c94c19f130 100644 --- a/kernel/time/clockevents.c +++ b/kernel/time/clockevents.c @@ -15,46 +15,82 @@  #include <linux/hrtimer.h>  #include <linux/init.h>  #include <linux/module.h> -#include <linux/notifier.h>  #include <linux/smp.h> -#include <linux/sysdev.h> -#include <linux/tick.h> +#include <linux/device.h>  #include "tick-internal.h"  /* The registered clock event devices */  static LIST_HEAD(clockevent_devices);  static LIST_HEAD(clockevents_released); - -/* Notification for clock events */ -static RAW_NOTIFIER_HEAD(clockevents_chain); -  /* Protection for the above */  static DEFINE_RAW_SPINLOCK(clockevents_lock); +/* Protection for unbind operations */ +static DEFINE_MUTEX(clockevents_mutex); -/** - * clockevents_delta2ns - Convert a latch value (device ticks) to nanoseconds - * @latch:	value to convert - * @evt:	pointer to clock event device descriptor - * - * Math helper, returns latch value converted to nanoseconds (bound checked) - */ -u64 clockevent_delta2ns(unsigned long latch, struct clock_event_device *evt) +struct ce_unbind { +	struct clock_event_device *ce; +	int res; +}; + +static u64 cev_delta2ns(unsigned long latch, struct clock_event_device *evt, +			bool ismax)  {  	u64 clc = (u64) latch << evt->shift; +	u64 rnd;  	if (unlikely(!evt->mult)) {  		evt->mult = 1;  		WARN_ON(1);  	} +	rnd = (u64) evt->mult - 1; + +	/* +	 * Upper bound sanity check. If the backwards conversion is +	 * not equal latch, we know that the above shift overflowed. +	 */ +	if ((clc >> evt->shift) != (u64)latch) +		clc = ~0ULL; + +	/* +	 * Scaled math oddities: +	 * +	 * For mult <= (1 << shift) we can safely add mult - 1 to +	 * prevent integer rounding loss. 
So the backwards conversion +	 * from nsec to device ticks will be correct. +	 * +	 * For mult > (1 << shift), i.e. device frequency is > 1GHz we +	 * need to be careful. Adding mult - 1 will result in a value +	 * which when converted back to device ticks can be larger +	 * than latch by up to (mult - 1) >> shift. For the min_delta +	 * calculation we still want to apply this in order to stay +	 * above the minimum device ticks limit. For the upper limit +	 * we would end up with a latch value larger than the upper +	 * limit of the device, so we omit the add to stay below the +	 * device upper boundary. +	 * +	 * Also omit the add if it would overflow the u64 boundary. +	 */ +	if ((~0ULL - clc > rnd) && +	    (!ismax || evt->mult <= (1U << evt->shift))) +		clc += rnd;  	do_div(clc, evt->mult); -	if (clc < 1000) -		clc = 1000; -	if (clc > KTIME_MAX) -		clc = KTIME_MAX; -	return clc; +	/* Deltas less than 1usec are pointless noise */ +	return clc > 1000 ? clc : 1000; +} + +/** + * clockevents_delta2ns - Convert a latch value (device ticks) to nanoseconds + * @latch:	value to convert + * @evt:	pointer to clock event device descriptor + * + * Math helper, returns latch value converted to nanoseconds (bound checked) + */ +u64 clockevent_delta2ns(unsigned long latch, struct clock_event_device *evt) +{ +	return cev_delta2ns(latch, evt, false);  }  EXPORT_SYMBOL_GPL(clockevent_delta2ns); @@ -95,66 +131,145 @@ void clockevents_shutdown(struct clock_event_device *dev)  	dev->next_event.tv64 = KTIME_MAX;  } +#ifdef CONFIG_GENERIC_CLOCKEVENTS_MIN_ADJUST + +/* Limit min_delta to a jiffie */ +#define MIN_DELTA_LIMIT		(NSEC_PER_SEC / HZ) + +/** + * clockevents_increase_min_delta - raise minimum delta of a clock event device + * @dev:       device to increase the minimum delta + * + * Returns 0 on success, -ETIME when the minimum delta reached the limit. + */ +static int clockevents_increase_min_delta(struct clock_event_device *dev) +{ +	/* Nothing to do if we already reached the limit */ +	if (dev->min_delta_ns >= MIN_DELTA_LIMIT) { +		printk_deferred(KERN_WARNING +				"CE: Reprogramming failure. Giving up\n"); +		dev->next_event.tv64 = KTIME_MAX; +		return -ETIME; +	} + +	if (dev->min_delta_ns < 5000) +		dev->min_delta_ns = 5000; +	else +		dev->min_delta_ns += dev->min_delta_ns >> 1; + +	if (dev->min_delta_ns > MIN_DELTA_LIMIT) +		dev->min_delta_ns = MIN_DELTA_LIMIT; + +	printk_deferred(KERN_WARNING +			"CE: %s increased min_delta_ns to %llu nsec\n", +			dev->name ? dev->name : "?", +			(unsigned long long) dev->min_delta_ns); +	return 0; +} + +/** + * clockevents_program_min_delta - Set clock event device to the minimum delay. + * @dev:	device to program + * + * Returns 0 on success, -ETIME when the retry loop failed. + */ +static int clockevents_program_min_delta(struct clock_event_device *dev) +{ +	unsigned long long clc; +	int64_t delta; +	int i; + +	for (i = 0;;) { +		delta = dev->min_delta_ns; +		dev->next_event = ktime_add_ns(ktime_get(), delta); + +		if (dev->mode == CLOCK_EVT_MODE_SHUTDOWN) +			return 0; + +		dev->retries++; +		clc = ((unsigned long long) delta * dev->mult) >> dev->shift; +		if (dev->set_next_event((unsigned long) clc, dev) == 0) +			return 0; + +		if (++i > 2) { +			/* +			 * We tried 3 times to program the device with the +			 * given min_delta_ns. Try to increase the minimum +			 * delta, if that fails as well get out of here. 
+			 */ +			if (clockevents_increase_min_delta(dev)) +				return -ETIME; +			i = 0; +		} +	} +} + +#else  /* CONFIG_GENERIC_CLOCKEVENTS_MIN_ADJUST */ + +/** + * clockevents_program_min_delta - Set clock event device to the minimum delay. + * @dev:	device to program + * + * Returns 0 on success, -ETIME when the retry loop failed. + */ +static int clockevents_program_min_delta(struct clock_event_device *dev) +{ +	unsigned long long clc; +	int64_t delta; + +	delta = dev->min_delta_ns; +	dev->next_event = ktime_add_ns(ktime_get(), delta); + +	if (dev->mode == CLOCK_EVT_MODE_SHUTDOWN) +		return 0; + +	dev->retries++; +	clc = ((unsigned long long) delta * dev->mult) >> dev->shift; +	return dev->set_next_event((unsigned long) clc, dev); +} + +#endif /* CONFIG_GENERIC_CLOCKEVENTS_MIN_ADJUST */ +  /**   * clockevents_program_event - Reprogram the clock event device. + * @dev:	device to program   * @expires:	absolute expiry time (monotonic clock) + * @force:	program minimum delay if expires can not be set   *   * Returns 0 on success, -ETIME when the event is in the past.   */  int clockevents_program_event(struct clock_event_device *dev, ktime_t expires, -			      ktime_t now) +			      bool force)  {  	unsigned long long clc;  	int64_t delta; +	int rc;  	if (unlikely(expires.tv64 < 0)) {  		WARN_ON_ONCE(1);  		return -ETIME;  	} -	delta = ktime_to_ns(ktime_sub(expires, now)); - -	if (delta <= 0) -		return -ETIME; -  	dev->next_event = expires;  	if (dev->mode == CLOCK_EVT_MODE_SHUTDOWN)  		return 0; -	if (delta > dev->max_delta_ns) -		delta = dev->max_delta_ns; -	if (delta < dev->min_delta_ns) -		delta = dev->min_delta_ns; - -	clc = delta * dev->mult; -	clc >>= dev->shift; - -	return dev->set_next_event((unsigned long) clc, dev); -} +	/* Shortcut for clockevent devices that can deal with ktime. */ +	if (dev->features & CLOCK_EVT_FEAT_KTIME) +		return dev->set_next_ktime(expires, dev); -/** - * clockevents_register_notifier - register a clock events change listener - */ -int clockevents_register_notifier(struct notifier_block *nb) -{ -	unsigned long flags; -	int ret; +	delta = ktime_to_ns(ktime_sub(expires, ktime_get())); +	if (delta <= 0) +		return force ? clockevents_program_min_delta(dev) : -ETIME; -	raw_spin_lock_irqsave(&clockevents_lock, flags); -	ret = raw_notifier_chain_register(&clockevents_chain, nb); -	raw_spin_unlock_irqrestore(&clockevents_lock, flags); +	delta = min(delta, (int64_t) dev->max_delta_ns); +	delta = max(delta, (int64_t) dev->min_delta_ns); -	return ret; -} +	clc = ((unsigned long long) delta * dev->mult) >> dev->shift; +	rc = dev->set_next_event((unsigned long) clc, dev); -/* - * Notify about a clock event change. Called with clockevents_lock - * held. - */ -static void clockevents_do_notify(unsigned long reason, void *dev) -{ -	raw_notifier_call_chain(&clockevents_chain, reason, dev); +	return (rc && force) ? 
clockevents_program_min_delta(dev) : rc;  }  /* @@ -170,9 +285,93 @@ static void clockevents_notify_released(void)  				 struct clock_event_device, list);  		list_del(&dev->list);  		list_add(&dev->list, &clockevent_devices); -		clockevents_do_notify(CLOCK_EVT_NOTIFY_ADD, dev); +		tick_check_new_device(dev); +	} +} + +/* + * Try to install a replacement clock event device + */ +static int clockevents_replace(struct clock_event_device *ced) +{ +	struct clock_event_device *dev, *newdev = NULL; + +	list_for_each_entry(dev, &clockevent_devices, list) { +		if (dev == ced || dev->mode != CLOCK_EVT_MODE_UNUSED) +			continue; + +		if (!tick_check_replacement(newdev, dev)) +			continue; + +		if (!try_module_get(dev->owner)) +			continue; + +		if (newdev) +			module_put(newdev->owner); +		newdev = dev; +	} +	if (newdev) { +		tick_install_replacement(newdev); +		list_del_init(&ced->list); +	} +	return newdev ? 0 : -EBUSY; +} + +/* + * Called with clockevents_mutex and clockevents_lock held + */ +static int __clockevents_try_unbind(struct clock_event_device *ced, int cpu) +{ +	/* Fast track. Device is unused */ +	if (ced->mode == CLOCK_EVT_MODE_UNUSED) { +		list_del_init(&ced->list); +		return 0;  	} + +	return ced == per_cpu(tick_cpu_device, cpu).evtdev ? -EAGAIN : -EBUSY; +} + +/* + * SMP function call to unbind a device + */ +static void __clockevents_unbind(void *arg) +{ +	struct ce_unbind *cu = arg; +	int res; + +	raw_spin_lock(&clockevents_lock); +	res = __clockevents_try_unbind(cu->ce, smp_processor_id()); +	if (res == -EAGAIN) +		res = clockevents_replace(cu->ce); +	cu->res = res; +	raw_spin_unlock(&clockevents_lock); +} + +/* + * Issues smp function call to unbind a per cpu device. Called with + * clockevents_mutex held. + */ +static int clockevents_unbind(struct clock_event_device *ced, int cpu) +{ +	struct ce_unbind cu = { .ce = ced, .res = -ENODEV }; + +	smp_call_function_single(cpu, __clockevents_unbind, &cu, 1); +	return cu.res; +} + +/* + * Unbind a clockevents device. + */ +int clockevents_unbind_device(struct clock_event_device *ced, int cpu) +{ +	int ret; + +	mutex_lock(&clockevents_mutex); +	ret = clockevents_unbind(ced, cpu); +	mutex_unlock(&clockevents_mutex); +	return ret;  } +EXPORT_SYMBOL_GPL(clockevents_unbind);  /**   * clockevents_register_device - register a clock event device @@ -183,18 +382,103 @@ void clockevents_register_device(struct clock_event_device *dev)  	unsigned long flags;  	BUG_ON(dev->mode != CLOCK_EVT_MODE_UNUSED); -	BUG_ON(!dev->cpumask); +	if (!dev->cpumask) { +		WARN_ON(num_possible_cpus() > 1); +		dev->cpumask = cpumask_of(smp_processor_id()); +	}  	raw_spin_lock_irqsave(&clockevents_lock, flags);  	list_add(&dev->list, &clockevent_devices); -	clockevents_do_notify(CLOCK_EVT_NOTIFY_ADD, dev); +	tick_check_new_device(dev);  	clockevents_notify_released();  	raw_spin_unlock_irqrestore(&clockevents_lock, flags);  }  EXPORT_SYMBOL_GPL(clockevents_register_device); +void clockevents_config(struct clock_event_device *dev, u32 freq) +{ +	u64 sec; + +	if (!(dev->features & CLOCK_EVT_FEAT_ONESHOT)) +		return; + +	/* +	 * Calculate the maximum number of seconds we can sleep. Limit +	 * to 10 minutes for hardware which can program more than +	 * 32bit ticks so we still get reasonable conversion values. 
+	 */ +	sec = dev->max_delta_ticks; +	do_div(sec, freq); +	if (!sec) +		sec = 1; +	else if (sec > 600 && dev->max_delta_ticks > UINT_MAX) +		sec = 600; + +	clockevents_calc_mult_shift(dev, freq, sec); +	dev->min_delta_ns = cev_delta2ns(dev->min_delta_ticks, dev, false); +	dev->max_delta_ns = cev_delta2ns(dev->max_delta_ticks, dev, true); +} + +/** + * clockevents_config_and_register - Configure and register a clock event device + * @dev:	device to register + * @freq:	The clock frequency + * @min_delta:	The minimum clock ticks to program in oneshot mode + * @max_delta:	The maximum clock ticks to program in oneshot mode + * + * min/max_delta can be 0 for devices which do not support oneshot mode. + */ +void clockevents_config_and_register(struct clock_event_device *dev, +				     u32 freq, unsigned long min_delta, +				     unsigned long max_delta) +{ +	dev->min_delta_ticks = min_delta; +	dev->max_delta_ticks = max_delta; +	clockevents_config(dev, freq); +	clockevents_register_device(dev); +} +EXPORT_SYMBOL_GPL(clockevents_config_and_register); + +int __clockevents_update_freq(struct clock_event_device *dev, u32 freq) +{ +	clockevents_config(dev, freq); + +	if (dev->mode == CLOCK_EVT_MODE_ONESHOT) +		return clockevents_program_event(dev, dev->next_event, false); + +	if (dev->mode == CLOCK_EVT_MODE_PERIODIC) +		dev->set_mode(CLOCK_EVT_MODE_PERIODIC, dev); + +	return 0; +} + +/** + * clockevents_update_freq - Update frequency and reprogram a clock event device. + * @dev:	device to modify + * @freq:	new device frequency + * + * Reconfigure and reprogram a clock event device in oneshot + * mode. Must be called on the cpu for which the device delivers per + * cpu timer events. If called for the broadcast device the core takes + * care of serialization. + * + * Returns 0 on success, -ETIME when the event is in the past. + */ +int clockevents_update_freq(struct clock_event_device *dev, u32 freq) +{ +	unsigned long flags; +	int ret; + +	local_irq_save(flags); +	ret = tick_broadcast_update_freq(dev, freq); +	if (ret == -ENODEV) +		ret = __clockevents_update_freq(dev, freq); +	local_irq_restore(flags); +	return ret; +} +  /*   * Noop handler when we shut down an event device   */ @@ -220,6 +504,7 @@ void clockevents_exchange_device(struct clock_event_device *old,  	 * released list and do a notify add later.  	 
*/  	if (old) { +		module_put(old->owner);  		clockevents_set_mode(old, CLOCK_EVT_MODE_UNUSED);  		list_del(&old->list);  		list_add(&old->list, &clockevents_released); @@ -232,21 +517,72 @@ void clockevents_exchange_device(struct clock_event_device *old,  	local_irq_restore(flags);  } +/** + * clockevents_suspend - suspend clock devices + */ +void clockevents_suspend(void) +{ +	struct clock_event_device *dev; + +	list_for_each_entry_reverse(dev, &clockevent_devices, list) +		if (dev->suspend) +			dev->suspend(dev); +} + +/** + * clockevents_resume - resume clock devices + */ +void clockevents_resume(void) +{ +	struct clock_event_device *dev; + +	list_for_each_entry(dev, &clockevent_devices, list) +		if (dev->resume) +			dev->resume(dev); +} +  #ifdef CONFIG_GENERIC_CLOCKEVENTS  /**   * clockevents_notify - notification about relevant events + * Returns 0 on success, any other value on error   */ -void clockevents_notify(unsigned long reason, void *arg) +int clockevents_notify(unsigned long reason, void *arg)  {  	struct clock_event_device *dev, *tmp;  	unsigned long flags; -	int cpu; +	int cpu, ret = 0;  	raw_spin_lock_irqsave(&clockevents_lock, flags); -	clockevents_do_notify(reason, arg);  	switch (reason) { +	case CLOCK_EVT_NOTIFY_BROADCAST_ON: +	case CLOCK_EVT_NOTIFY_BROADCAST_OFF: +	case CLOCK_EVT_NOTIFY_BROADCAST_FORCE: +		tick_broadcast_on_off(reason, arg); +		break; + +	case CLOCK_EVT_NOTIFY_BROADCAST_ENTER: +	case CLOCK_EVT_NOTIFY_BROADCAST_EXIT: +		ret = tick_broadcast_oneshot_control(reason); +		break; + +	case CLOCK_EVT_NOTIFY_CPU_DYING: +		tick_handover_do_timer(arg); +		break; + +	case CLOCK_EVT_NOTIFY_SUSPEND: +		tick_suspend(); +		tick_suspend_broadcast(); +		break; + +	case CLOCK_EVT_NOTIFY_RESUME: +		tick_resume(); +		break; +  	case CLOCK_EVT_NOTIFY_CPU_DEAD: +		tick_shutdown_broadcast_oneshot(arg); +		tick_shutdown_broadcast(arg); +		tick_shutdown(arg);  		/*  		 * Unregister the clock event devices which were  		 * released from the users in the notify chain. 
@@ -270,6 +606,126 @@ void clockevents_notify(unsigned long reason, void *arg)  		break;  	}  	raw_spin_unlock_irqrestore(&clockevents_lock, flags); +	return ret;  }  EXPORT_SYMBOL_GPL(clockevents_notify); + +#ifdef CONFIG_SYSFS +struct bus_type clockevents_subsys = { +	.name		= "clockevents", +	.dev_name       = "clockevent", +}; + +static DEFINE_PER_CPU(struct device, tick_percpu_dev); +static struct tick_device *tick_get_tick_dev(struct device *dev); + +static ssize_t sysfs_show_current_tick_dev(struct device *dev, +					   struct device_attribute *attr, +					   char *buf) +{ +	struct tick_device *td; +	ssize_t count = 0; + +	raw_spin_lock_irq(&clockevents_lock); +	td = tick_get_tick_dev(dev); +	if (td && td->evtdev) +		count = snprintf(buf, PAGE_SIZE, "%s\n", td->evtdev->name); +	raw_spin_unlock_irq(&clockevents_lock); +	return count; +} +static DEVICE_ATTR(current_device, 0444, sysfs_show_current_tick_dev, NULL); + +/* We don't support the abomination of removable broadcast devices */ +static ssize_t sysfs_unbind_tick_dev(struct device *dev, +				     struct device_attribute *attr, +				     const char *buf, size_t count) +{ +	char name[CS_NAME_LEN]; +	ssize_t ret = sysfs_get_uname(buf, name, count); +	struct clock_event_device *ce; + +	if (ret < 0) +		return ret; + +	ret = -ENODEV; +	mutex_lock(&clockevents_mutex); +	raw_spin_lock_irq(&clockevents_lock); +	list_for_each_entry(ce, &clockevent_devices, list) { +		if (!strcmp(ce->name, name)) { +			ret = __clockevents_try_unbind(ce, dev->id); +			break; +		} +	} +	raw_spin_unlock_irq(&clockevents_lock); +	/* +	 * We hold clockevents_mutex, so ce can't go away +	 */ +	if (ret == -EAGAIN) +		ret = clockevents_unbind(ce, dev->id); +	mutex_unlock(&clockevents_mutex); +	return ret ? ret : count; +} +static DEVICE_ATTR(unbind_device, 0200, NULL, sysfs_unbind_tick_dev); + +#ifdef CONFIG_GENERIC_CLOCKEVENTS_BROADCAST +static struct device tick_bc_dev = { +	.init_name	= "broadcast", +	.id		= 0, +	.bus		= &clockevents_subsys, +}; + +static struct tick_device *tick_get_tick_dev(struct device *dev) +{ +	return dev == &tick_bc_dev ? 
tick_get_broadcast_device() : +		&per_cpu(tick_cpu_device, dev->id); +} + +static __init int tick_broadcast_init_sysfs(void) +{ +	int err = device_register(&tick_bc_dev); + +	if (!err) +		err = device_create_file(&tick_bc_dev, &dev_attr_current_device); +	return err; +} +#else +static struct tick_device *tick_get_tick_dev(struct device *dev) +{ +	return &per_cpu(tick_cpu_device, dev->id); +} +static inline int tick_broadcast_init_sysfs(void) { return 0; }  #endif + +static int __init tick_init_sysfs(void) +{ +	int cpu; + +	for_each_possible_cpu(cpu) { +		struct device *dev = &per_cpu(tick_percpu_dev, cpu); +		int err; + +		dev->id = cpu; +		dev->bus = &clockevents_subsys; +		err = device_register(dev); +		if (!err) +			err = device_create_file(dev, &dev_attr_current_device); +		if (!err) +			err = device_create_file(dev, &dev_attr_unbind_device); +		if (err) +			return err; +	} +	return tick_broadcast_init_sysfs(); +} + +static int __init clockevents_init_sysfs(void) +{ +	int err = subsys_system_register(&clockevents_subsys, NULL); + +	if (!err) +		err = tick_init_sysfs(); +	return err; +} +device_initcall(clockevents_init_sysfs); +#endif /* SYSFS */ + +#endif /* GENERIC_CLOCK_EVENTS */ diff --git a/kernel/time/clocksource.c b/kernel/time/clocksource.c index c18d7efa1b4..ba3e502c955 100644 --- a/kernel/time/clocksource.c +++ b/kernel/time/clocksource.c @@ -23,14 +23,16 @@   *   o Allow clocksource drivers to be unregistered   */ +#include <linux/device.h>  #include <linux/clocksource.h> -#include <linux/sysdev.h>  #include <linux/init.h>  #include <linux/module.h>  #include <linux/sched.h> /* for spin_unlock_irq() using preempt_count() m68k */  #include <linux/tick.h>  #include <linux/kthread.h> +#include "tick-internal.h" +  void timecounter_init(struct timecounter *tc,  		      const struct cyclecounter *cc,  		      u64 start_tstamp) @@ -113,7 +115,7 @@ EXPORT_SYMBOL_GPL(timecounter_cyc2time);   * @shift:	pointer to shift variable   * @from:	frequency to convert from   * @to:		frequency to convert to - * @minsec:	guaranteed runtime conversion range in seconds + * @maxsec:	guaranteed runtime conversion range in seconds   *   * The function evaluates the shift/mult pair for the scaled math   * operations of clocksources and clockevents. @@ -122,7 +124,7 @@ EXPORT_SYMBOL_GPL(timecounter_cyc2time);   * NSEC_PER_SEC == 1GHz and @from is the counter frequency. For clock   * event @to is the counter frequency and @from is NSEC_PER_SEC.   * - * The @minsec conversion range argument controls the time frame in + * The @maxsec conversion range argument controls the time frame in   * seconds which must be covered by the runtime conversion with the   * calculated mult and shift factors. This guarantees that no 64bit   * overflow happens when the input value of the conversion is @@ -131,7 +133,7 @@ EXPORT_SYMBOL_GPL(timecounter_cyc2time);   * factors.   
*/  void -clocks_calc_mult_shift(u32 *mult, u32 *shift, u32 from, u32 to, u32 minsec) +clocks_calc_mult_shift(u32 *mult, u32 *shift, u32 from, u32 to, u32 maxsec)  {  	u64 tmp;  	u32 sft, sftacc= 32; @@ -140,7 +142,7 @@ clocks_calc_mult_shift(u32 *mult, u32 *shift, u32 from, u32 to, u32 minsec)  	 * Calculate the shift factor which is limiting the conversion  	 * range:  	 */ -	tmp = ((u64)minsec * from) >> 32; +	tmp = ((u64)maxsec * from) >> 32;  	while (tmp) {  		tmp >>=1;  		sftacc--; @@ -152,6 +154,7 @@ clocks_calc_mult_shift(u32 *mult, u32 *shift, u32 from, u32 to, u32 minsec)  	 */  	for (sft = 32; sft > 0; sft--) {  		tmp = (u64) to << sft; +		tmp += from / 2;  		do_div(tmp, from);  		if ((tmp >> sftacc) == 0)  			break; @@ -173,19 +176,20 @@ clocks_calc_mult_shift(u32 *mult, u32 *shift, u32 from, u32 to, u32 minsec)  static struct clocksource *curr_clocksource;  static LIST_HEAD(clocksource_list);  static DEFINE_MUTEX(clocksource_mutex); -static char override_name[32]; +static char override_name[CS_NAME_LEN];  static int finished_booting;  #ifdef CONFIG_CLOCKSOURCE_WATCHDOG  static void clocksource_watchdog_work(struct work_struct *work); +static void clocksource_select(void);  static LIST_HEAD(watchdog_list);  static struct clocksource *watchdog;  static struct timer_list watchdog_timer;  static DECLARE_WORK(watchdog_work, clocksource_watchdog_work);  static DEFINE_SPINLOCK(watchdog_lock); -static cycle_t watchdog_last;  static int watchdog_running; +static atomic_t watchdog_reset_pending;  static int clocksource_watchdog_kthread(void *data);  static void __clocksource_change_rating(struct clocksource *cs, int rating); @@ -247,16 +251,13 @@ static void clocksource_watchdog(unsigned long data)  	struct clocksource *cs;  	cycle_t csnow, wdnow;  	int64_t wd_nsec, cs_nsec; -	int next_cpu; +	int next_cpu, reset_pending;  	spin_lock(&watchdog_lock);  	if (!watchdog_running)  		goto out; -	wdnow = watchdog->read(watchdog); -	wd_nsec = clocksource_cyc2ns((wdnow - watchdog_last) & watchdog->mask, -				     watchdog->mult, watchdog->shift); -	watchdog_last = wdnow; +	reset_pending = atomic_read(&watchdog_reset_pending);  	list_for_each_entry(cs, &watchdog_list, wd_list) { @@ -267,20 +268,33 @@ static void clocksource_watchdog(unsigned long data)  			continue;  		} +		local_irq_disable();  		csnow = cs->read(cs); +		wdnow = watchdog->read(watchdog); +		local_irq_enable();  		/* Clocksource initialized ? */ -		if (!(cs->flags & CLOCK_SOURCE_WATCHDOG)) { +		if (!(cs->flags & CLOCK_SOURCE_WATCHDOG) || +		    atomic_read(&watchdog_reset_pending)) {  			cs->flags |= CLOCK_SOURCE_WATCHDOG; -			cs->wd_last = csnow; +			cs->wd_last = wdnow; +			cs->cs_last = csnow;  			continue;  		} -		/* Check the deviation from the watchdog clocksource. */ -		cs_nsec = clocksource_cyc2ns((csnow - cs->wd_last) & +		wd_nsec = clocksource_cyc2ns((wdnow - cs->wd_last) & watchdog->mask, +					     watchdog->mult, watchdog->shift); + +		cs_nsec = clocksource_cyc2ns((csnow - cs->cs_last) &  					     cs->mask, cs->mult, cs->shift); -		cs->wd_last = csnow; -		if (abs(cs_nsec - wd_nsec) > WATCHDOG_THRESHOLD) { +		cs->cs_last = csnow; +		cs->wd_last = wdnow; + +		if (atomic_read(&watchdog_reset_pending)) +			continue; + +		/* Check the deviation from the watchdog clocksource. 
*/ +		if ((abs(cs_nsec - wd_nsec) > WATCHDOG_THRESHOLD)) {  			clocksource_unstable(cs, cs_nsec - wd_nsec);  			continue;  		} @@ -288,17 +302,41 @@ static void clocksource_watchdog(unsigned long data)  		if (!(cs->flags & CLOCK_SOURCE_VALID_FOR_HRES) &&  		    (cs->flags & CLOCK_SOURCE_IS_CONTINUOUS) &&  		    (watchdog->flags & CLOCK_SOURCE_IS_CONTINUOUS)) { +			/* Mark it valid for high-res. */  			cs->flags |= CLOCK_SOURCE_VALID_FOR_HRES; + +			/* +			 * clocksource_done_booting() will sort it if +			 * finished_booting is not set yet. +			 */ +			if (!finished_booting) +				continue; +  			/* -			 * We just marked the clocksource as highres-capable, -			 * notify the rest of the system as well so that we -			 * transition into high-res mode: +			 * If this is not the current clocksource let +			 * the watchdog thread reselect it. Due to the +			 * change to high res this clocksource might +			 * be preferred now. If it is the current +			 * clocksource let the tick code know about +			 * that change.  			 */ -			tick_clock_notify(); +			if (cs != curr_clocksource) { +				cs->flags |= CLOCK_SOURCE_RESELECT; +				schedule_work(&watchdog_work); +			} else { +				tick_clock_notify(); +			}  		}  	}  	/* +	 * We only clear the watchdog_reset_pending, when we did a +	 * full cycle through all clocksources. +	 */ +	if (reset_pending) +		atomic_dec(&watchdog_reset_pending); + +	/*  	 * Cycle through CPUs to check if the CPUs stay synchronized  	 * to each other.  	 */ @@ -317,7 +355,6 @@ static inline void clocksource_start_watchdog(void)  		return;  	init_timer(&watchdog_timer);  	watchdog_timer.function = clocksource_watchdog; -	watchdog_last = watchdog->read(watchdog);  	watchdog_timer.expires = jiffies + WATCHDOG_INTERVAL;  	add_timer_on(&watchdog_timer, cpumask_first(cpu_online_mask));  	watchdog_running = 1; @@ -341,23 +378,7 @@ static inline void clocksource_reset_watchdog(void)  static void clocksource_resume_watchdog(void)  { -	unsigned long flags; - -	/* -	 * We use trylock here to avoid a potential dead lock when -	 * kgdb calls this code after the kernel has been stopped with -	 * watchdog_lock held. When watchdog_lock is held we just -	 * return and accept, that the watchdog might trigger and mark -	 * the monitored clock source (usually TSC) unstable. -	 * -	 * This does not affect the other caller clocksource_resume() -	 * because at this point the kernel is UP, interrupts are -	 * disabled and nothing can hold watchdog_lock. -	 */ -	if (!spin_trylock_irqsave(&watchdog_lock, flags)) -		return; -	clocksource_reset_watchdog(); -	spin_unlock_irqrestore(&watchdog_lock, flags); +	atomic_inc(&watchdog_reset_pending);  }  static void clocksource_enqueue_watchdog(struct clocksource *cs) @@ -387,44 +408,39 @@ static void clocksource_enqueue_watchdog(struct clocksource *cs)  static void clocksource_dequeue_watchdog(struct clocksource *cs)  { -	struct clocksource *tmp;  	unsigned long flags;  	spin_lock_irqsave(&watchdog_lock, flags); -	if (cs->flags & CLOCK_SOURCE_MUST_VERIFY) { -		/* cs is a watched clocksource. */ -		list_del_init(&cs->wd_list); -	} else if (cs == watchdog) { -		/* Reset watchdog cycles */ -		clocksource_reset_watchdog(); -		/* Current watchdog is removed. Find an alternative. 
*/ -		watchdog = NULL; -		list_for_each_entry(tmp, &clocksource_list, list) { -			if (tmp == cs || tmp->flags & CLOCK_SOURCE_MUST_VERIFY) -				continue; -			if (!watchdog || tmp->rating > watchdog->rating) -				watchdog = tmp; +	if (cs != watchdog) { +		if (cs->flags & CLOCK_SOURCE_MUST_VERIFY) { +			/* cs is a watched clocksource. */ +			list_del_init(&cs->wd_list); +			/* Check if the watchdog timer needs to be stopped. */ +			clocksource_stop_watchdog();  		}  	} -	cs->flags &= ~CLOCK_SOURCE_WATCHDOG; -	/* Check if the watchdog timer needs to be stopped. */ -	clocksource_stop_watchdog();  	spin_unlock_irqrestore(&watchdog_lock, flags);  } -static int clocksource_watchdog_kthread(void *data) +static int __clocksource_watchdog_kthread(void)  {  	struct clocksource *cs, *tmp;  	unsigned long flags;  	LIST_HEAD(unstable); +	int select = 0; -	mutex_lock(&clocksource_mutex);  	spin_lock_irqsave(&watchdog_lock, flags); -	list_for_each_entry_safe(cs, tmp, &watchdog_list, wd_list) +	list_for_each_entry_safe(cs, tmp, &watchdog_list, wd_list) {  		if (cs->flags & CLOCK_SOURCE_UNSTABLE) {  			list_del_init(&cs->wd_list);  			list_add(&cs->wd_list, &unstable); +			select = 1;  		} +		if (cs->flags & CLOCK_SOURCE_RESELECT) { +			cs->flags &= ~CLOCK_SOURCE_RESELECT; +			select = 1; +		} +	}  	/* Check if the watchdog timer needs to be stopped. */  	clocksource_stop_watchdog();  	spin_unlock_irqrestore(&watchdog_lock, flags); @@ -434,10 +450,23 @@ static int clocksource_watchdog_kthread(void *data)  		list_del_init(&cs->wd_list);  		__clocksource_change_rating(cs, 0);  	} +	return select; +} + +static int clocksource_watchdog_kthread(void *data) +{ +	mutex_lock(&clocksource_mutex); +	if (__clocksource_watchdog_kthread()) +		clocksource_select();  	mutex_unlock(&clocksource_mutex);  	return 0;  } +static bool clocksource_is_watchdog(struct clocksource *cs) +{ +	return cs == watchdog; +} +  #else /* CONFIG_CLOCKSOURCE_WATCHDOG */  static void clocksource_enqueue_watchdog(struct clocksource *cs) @@ -448,7 +477,9 @@ static void clocksource_enqueue_watchdog(struct clocksource *cs)  static inline void clocksource_dequeue_watchdog(struct clocksource *cs) { }  static inline void clocksource_resume_watchdog(void) { } -static inline int clocksource_watchdog_kthread(void *data) { return 0; } +static inline int __clocksource_watchdog_kthread(void) { return 0; } +static bool clocksource_is_watchdog(struct clocksource *cs) { return false; } +void clocksource_mark_unstable(struct clocksource *cs) { }  #endif /* CONFIG_CLOCKSOURCE_WATCHDOG */ @@ -491,66 +522,118 @@ void clocksource_touch_watchdog(void)  }  /** - * clocksource_max_deferment - Returns max time the clocksource can be deferred + * clocksource_max_adjustment- Returns max adjustment amount   * @cs:         Pointer to clocksource   *   */ -static u64 clocksource_max_deferment(struct clocksource *cs) +static u32 clocksource_max_adjustment(struct clocksource *cs) +{ +	u64 ret; +	/* +	 * We won't try to correct for more than 11% adjustments (110,000 ppm), +	 */ +	ret = (u64)cs->mult * 11; +	do_div(ret,100); +	return (u32)ret; +} + +/** + * clocks_calc_max_nsecs - Returns maximum nanoseconds that can be converted + * @mult:	cycle to nanosecond multiplier + * @shift:	cycle to nanosecond divisor (power of two) + * @maxadj:	maximum adjustment value to mult (~11%) + * @mask:	bitmask for two's complement subtraction of non 64 bit counters + */ +u64 clocks_calc_max_nsecs(u32 mult, u32 shift, u32 maxadj, u64 mask)  {  	u64 max_nsecs, max_cycles;  	/*  	 * Calculate the 
maximum number of cycles that we can pass to the  	 * cyc2ns function without overflowing a 64-bit signed result. The -	 * maximum number of cycles is equal to ULLONG_MAX/cs->mult which -	 * is equivalent to the below. -	 * max_cycles < (2^63)/cs->mult -	 * max_cycles < 2^(log2((2^63)/cs->mult)) -	 * max_cycles < 2^(log2(2^63) - log2(cs->mult)) -	 * max_cycles < 2^(63 - log2(cs->mult)) -	 * max_cycles < 1 << (63 - log2(cs->mult)) +	 * maximum number of cycles is equal to ULLONG_MAX/(mult+maxadj) +	 * which is equivalent to the below. +	 * max_cycles < (2^63)/(mult + maxadj) +	 * max_cycles < 2^(log2((2^63)/(mult + maxadj))) +	 * max_cycles < 2^(log2(2^63) - log2(mult + maxadj)) +	 * max_cycles < 2^(63 - log2(mult + maxadj)) +	 * max_cycles < 1 << (63 - log2(mult + maxadj))  	 * Please note that we add 1 to the result of the log2 to account for  	 * any rounding errors, ensure the above inequality is satisfied and  	 * no overflow will occur.  	 */ -	max_cycles = 1ULL << (63 - (ilog2(cs->mult) + 1)); +	max_cycles = 1ULL << (63 - (ilog2(mult + maxadj) + 1));  	/*  	 * The actual maximum number of cycles we can defer the clocksource is -	 * determined by the minimum of max_cycles and cs->mask. +	 * determined by the minimum of max_cycles and mask. +	 * Note: Here we subtract the maxadj to make sure we don't sleep for +	 * too long if there's a large negative adjustment.  	 */ -	max_cycles = min_t(u64, max_cycles, (u64) cs->mask); -	max_nsecs = clocksource_cyc2ns(max_cycles, cs->mult, cs->shift); +	max_cycles = min(max_cycles, mask); +	max_nsecs = clocksource_cyc2ns(max_cycles, mult - maxadj, shift); + +	return max_nsecs; +} + +/** + * clocksource_max_deferment - Returns max time the clocksource can be deferred + * @cs:         Pointer to clocksource + * + */ +static u64 clocksource_max_deferment(struct clocksource *cs) +{ +	u64 max_nsecs; +	max_nsecs = clocks_calc_max_nsecs(cs->mult, cs->shift, cs->maxadj, +					  cs->mask);  	/*  	 * To ensure that the clocksource does not wrap whilst we are idle,  	 * limit the time the clocksource can be deferred by 12.5%. Please  	 * note a margin of 12.5% is used because this can be computed with  	 * a shift, versus say 10% which would require division.  	 */ -	return max_nsecs - (max_nsecs >> 5); +	return max_nsecs - (max_nsecs >> 3);  }  #ifndef CONFIG_ARCH_USES_GETTIMEOFFSET -/** - * clocksource_select - Select the best clocksource available - * - * Private function. Must hold clocksource_mutex when called. - * - * Select the clocksource with the best rating, or the clocksource, - * which is selected by userspace override. - */ -static void clocksource_select(void) +static struct clocksource *clocksource_find_best(bool oneshot, bool skipcur)  { -	struct clocksource *best, *cs; +	struct clocksource *cs;  	if (!finished_booting || list_empty(&clocksource_list)) +		return NULL; + +	/* +	 * We pick the clocksource with the highest rating. If oneshot +	 * mode is active, we pick the highres valid clocksource with +	 * the best rating. 
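
For the clocks_calc_max_nsecs() / clocksource_max_deferment() math earlier in this hunk, the bound can be reproduced in userspace as below. The mult/shift/maxadj/mask values are purely illustrative (roughly a 24 MHz counter with a 32-bit mask) and are not taken from the patch.

#include <stdint.h>
#include <stdio.h>

static int ilog2_u64(uint64_t v)
{
	int l = -1;

	while (v) {
		v >>= 1;
		l++;
	}
	return l;
}

/* Mirrors the overflow bound derived in clocks_calc_max_nsecs() above. */
static uint64_t calc_max_nsecs(uint32_t mult, uint32_t shift,
			       uint32_t maxadj, uint64_t mask)
{
	/* largest cycle count whose cyc2ns result stays below 2^63 */
	uint64_t max_cycles = 1ULL << (63 - (ilog2_u64(mult + maxadj) + 1));

	if (max_cycles > mask)
		max_cycles = mask;

	/* worst case: NTP has pulled mult down by maxadj */
	return (max_cycles * (mult - maxadj)) >> shift;
}

int main(void)
{
	uint32_t mult = 699050667, shift = 24;		/* ~24 MHz -> ns, illustrative */
	uint32_t maxadj = (uint64_t)mult * 11 / 100;	/* the 11% adjustment cap */
	uint64_t mask = 0xffffffffULL;			/* 32-bit counter */
	uint64_t max_ns = calc_max_nsecs(mult, shift, maxadj, mask);

	/* clocksource_max_deferment() then trims the 12.5% safety margin */
	printf("max %llu ns, max_idle_ns %llu\n",
	       (unsigned long long)max_ns,
	       (unsigned long long)(max_ns - (max_ns >> 3)));
	return 0;
}
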
+	 */ +	list_for_each_entry(cs, &clocksource_list, list) { +		if (skipcur && cs == curr_clocksource) +			continue; +		if (oneshot && !(cs->flags & CLOCK_SOURCE_VALID_FOR_HRES)) +			continue; +		return cs; +	} +	return NULL; +} + +static void __clocksource_select(bool skipcur) +{ +	bool oneshot = tick_oneshot_mode_active(); +	struct clocksource *best, *cs; + +	/* Find the best suitable clocksource */ +	best = clocksource_find_best(oneshot, skipcur); +	if (!best)  		return; -	/* First clocksource on the list has the best rating. */ -	best = list_first_entry(&clocksource_list, struct clocksource, list); +  	/* Check for the override clocksource. */  	list_for_each_entry(cs, &clocksource_list, list) { +		if (skipcur && cs == curr_clocksource) +			continue;  		if (strcmp(cs->name, override_name) != 0)  			continue;  		/* @@ -558,8 +641,7 @@ static void clocksource_select(void)  		 * capable clocksource if the tick code is in oneshot  		 * mode (highres or nohz)  		 */ -		if (!(cs->flags & CLOCK_SOURCE_VALID_FOR_HRES) && -		    tick_oneshot_mode_active()) { +		if (!(cs->flags & CLOCK_SOURCE_VALID_FOR_HRES) && oneshot) {  			/* Override clocksource cannot be used. */  			printk(KERN_WARNING "Override clocksource %s is not "  			       "HRT compatible. Cannot switch while in " @@ -570,16 +652,35 @@ static void clocksource_select(void)  			best = cs;  		break;  	} -	if (curr_clocksource != best) { -		printk(KERN_INFO "Switching to clocksource %s\n", best->name); + +	if (curr_clocksource != best && !timekeeping_notify(best)) { +		pr_info("Switched to clocksource %s\n", best->name);  		curr_clocksource = best; -		timekeeping_notify(curr_clocksource);  	}  } +/** + * clocksource_select - Select the best clocksource available + * + * Private function. Must hold clocksource_mutex when called. + * + * Select the clocksource with the best rating, or the clocksource, + * which is selected by userspace override. + */ +static void clocksource_select(void) +{ +	return __clocksource_select(false); +} + +static void clocksource_select_fallback(void) +{ +	return __clocksource_select(true); +} +  #else /* !CONFIG_ARCH_USES_GETTIMEOFFSET */  static inline void clocksource_select(void) { } +static inline void clocksource_select_fallback(void) { }  #endif @@ -594,16 +695,11 @@ static int __init clocksource_done_booting(void)  {  	mutex_lock(&clocksource_mutex);  	curr_clocksource = clocksource_default_clock(); -	mutex_unlock(&clocksource_mutex); -  	finished_booting = 1; -  	/*  	 * Run the watchdog first to eliminate unstable clock sources  	 */ -	clocksource_watchdog_kthread(NULL); - -	mutex_lock(&clocksource_mutex); +	__clocksource_watchdog_kthread();  	clocksource_select();  	mutex_unlock(&clocksource_mutex);  	return 0; @@ -625,22 +721,9 @@ static void clocksource_enqueue(struct clocksource *cs)  	list_add(&cs->list, entry);  } - -/* - * Maximum time we expect to go between ticks. This includes idle - * tickless time. It provides the trade off between selecting a - * mult/shift pair that is very precise but can only handle a short - * period of time, vs. a mult/shift pair that can handle long periods - * of time but isn't as precise. - * - * This is a subsystem constant, and actual hardware limitations - * may override it (ie: clocksources that wrap every 3 seconds). 
- */ -#define MAX_UPDATE_LENGTH 5 /* Seconds */ -  /**   * __clocksource_updatefreq_scale - Used update clocksource with new freq - * @t:		clocksource to be registered + * @cs:		clocksource to be registered   * @scale:	Scale factor multiplied against freq to get clocksource hz   * @freq:	clocksource frequency (cycles per second) divided by scale   * @@ -651,22 +734,48 @@ static void clocksource_enqueue(struct clocksource *cs)   */  void __clocksource_updatefreq_scale(struct clocksource *cs, u32 scale, u32 freq)  { +	u64 sec;  	/* -	 * Ideally we want to use  some of the limits used in -	 * clocksource_max_deferment, to provide a more informed -	 * MAX_UPDATE_LENGTH. But for now this just gets the -	 * register interface working properly. +	 * Calc the maximum number of seconds which we can run before +	 * wrapping around. For clocksources which have a mask > 32bit +	 * we need to limit the max sleep time to have a good +	 * conversion precision. 10 minutes is still a reasonable +	 * amount. That results in a shift value of 24 for a +	 * clocksource with mask >= 40bit and f >= 4GHz. That maps to +	 * ~ 0.06ppm granularity for NTP. We apply the same 12.5% +	 * margin as we do in clocksource_max_deferment()  	 */ +	sec = (cs->mask - (cs->mask >> 3)); +	do_div(sec, freq); +	do_div(sec, scale); +	if (!sec) +		sec = 1; +	else if (sec > 600 && cs->mask > UINT_MAX) +		sec = 600; +  	clocks_calc_mult_shift(&cs->mult, &cs->shift, freq, -				      NSEC_PER_SEC/scale, -				      MAX_UPDATE_LENGTH*scale); +			       NSEC_PER_SEC / scale, sec * scale); + +	/* +	 * for clocksources that have large mults, to avoid overflow. +	 * Since mult may be adjusted by ntp, add an safety extra margin +	 * +	 */ +	cs->maxadj = clocksource_max_adjustment(cs); +	while ((cs->mult + cs->maxadj < cs->mult) +		|| (cs->mult - cs->maxadj > cs->mult)) { +		cs->mult >>= 1; +		cs->shift--; +		cs->maxadj = clocksource_max_adjustment(cs); +	} +  	cs->max_idle_ns = clocksource_max_deferment(cs);  }  EXPORT_SYMBOL_GPL(__clocksource_updatefreq_scale);  /**   * __clocksource_register_scale - Used to install new clocksources - * @t:		clocksource to be registered + * @cs:		clocksource to be registered   * @scale:	Scale factor multiplied against freq to get clocksource hz   * @freq:	clocksource frequency (cycles per second) divided by scale   * @@ -678,14 +787,14 @@ EXPORT_SYMBOL_GPL(__clocksource_updatefreq_scale);  int __clocksource_register_scale(struct clocksource *cs, u32 scale, u32 freq)  { -	/* Intialize mult/shift and max_idle_ns */ +	/* Initialize mult/shift and max_idle_ns */  	__clocksource_updatefreq_scale(cs, scale, freq);  	/* Add clocksource to the clcoksource list */  	mutex_lock(&clocksource_mutex);  	clocksource_enqueue(cs); -	clocksource_select();  	clocksource_enqueue_watchdog(cs); +	clocksource_select();  	mutex_unlock(&clocksource_mutex);  	return 0;  } @@ -694,19 +803,25 @@ EXPORT_SYMBOL_GPL(__clocksource_register_scale);  /**   * clocksource_register - Used to install new clocksources - * @t:		clocksource to be registered + * @cs:		clocksource to be registered   *   * Returns -EBUSY if registration fails, zero otherwise.   
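
__clocksource_updatefreq_scale() above derives a conversion range from the counter mask and then asks clocks_calc_mult_shift() (reworked near the top of this file's diff to round the quotient) for a mult/shift pair. The helper can be exercised in userspace as below; the 24 MHz frequency and 600 s range are example inputs only.

#include <stdint.h>
#include <stdio.h>

/* Userspace copy of the rounded mult/shift search shown in the
 * clocks_calc_mult_shift() hunk above: "from" is the counter frequency,
 * "to" the target rate (here nanoseconds), "maxsec" the conversion range. */
static void calc_mult_shift(uint32_t *mult, uint32_t *shift,
			    uint32_t from, uint32_t to, uint32_t maxsec)
{
	uint64_t tmp;
	uint32_t sft, sftacc = 32;

	/* shift factor limiting the conversion range */
	tmp = ((uint64_t)maxsec * from) >> 32;
	while (tmp) {
		tmp >>= 1;
		sftacc--;
	}

	/* largest shift whose rounded multiplier still fits the range */
	for (sft = 32; sft > 0; sft--) {
		tmp = (uint64_t)to << sft;
		tmp += from / 2;		/* the rounding added by this patch */
		tmp /= from;
		if ((tmp >> sftacc) == 0)
			break;
	}
	*mult = (uint32_t)tmp;
	*shift = sft;
}

int main(void)
{
	uint32_t mult, shift;

	/* Example: a 24 MHz counter converted to nanoseconds over ~600 s */
	calc_mult_shift(&mult, &shift, 24000000, 1000000000, 600);
	printf("mult = %u, shift = %u\n", mult, shift);
	return 0;
}
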
*/  int clocksource_register(struct clocksource *cs)  { +	/* calculate max adjustment for given mult/shift */ +	cs->maxadj = clocksource_max_adjustment(cs); +	WARN_ONCE(cs->mult + cs->maxadj < cs->mult, +		"Clocksource %s might overflow on 11%% adjustment\n", +		cs->name); +  	/* calculate max idle time permitted for this clocksource */  	cs->max_idle_ns = clocksource_max_deferment(cs);  	mutex_lock(&clocksource_mutex);  	clocksource_enqueue(cs); -	clocksource_select();  	clocksource_enqueue_watchdog(cs); +	clocksource_select();  	mutex_unlock(&clocksource_mutex);  	return 0;  } @@ -717,30 +832,58 @@ static void __clocksource_change_rating(struct clocksource *cs, int rating)  	list_del(&cs->list);  	cs->rating = rating;  	clocksource_enqueue(cs); -	clocksource_select();  }  /**   * clocksource_change_rating - Change the rating of a registered clocksource + * @cs:		clocksource to be changed + * @rating:	new rating   */  void clocksource_change_rating(struct clocksource *cs, int rating)  {  	mutex_lock(&clocksource_mutex);  	__clocksource_change_rating(cs, rating); +	clocksource_select();  	mutex_unlock(&clocksource_mutex);  }  EXPORT_SYMBOL(clocksource_change_rating); +/* + * Unbind clocksource @cs. Called with clocksource_mutex held + */ +static int clocksource_unbind(struct clocksource *cs) +{ +	/* +	 * I really can't convince myself to support this on hardware +	 * designed by lobotomized monkeys. +	 */ +	if (clocksource_is_watchdog(cs)) +		return -EBUSY; + +	if (cs == curr_clocksource) { +		/* Select and try to install a replacement clock source */ +		clocksource_select_fallback(); +		if (curr_clocksource == cs) +			return -EBUSY; +	} +	clocksource_dequeue_watchdog(cs); +	list_del_init(&cs->list); +	return 0; +} +  /**   * clocksource_unregister - remove a registered clocksource + * @cs:	clocksource to be unregistered   */ -void clocksource_unregister(struct clocksource *cs) +int clocksource_unregister(struct clocksource *cs)  { +	int ret = 0; +  	mutex_lock(&clocksource_mutex); -	clocksource_dequeue_watchdog(cs); -	list_del(&cs->list); -	clocksource_select(); +	if (!list_empty(&cs->list)) +		ret = clocksource_unbind(cs);  	mutex_unlock(&clocksource_mutex); +	return ret;  }  EXPORT_SYMBOL(clocksource_unregister); @@ -748,13 +891,14 @@ EXPORT_SYMBOL(clocksource_unregister);  /**   * sysfs_show_current_clocksources - sysfs interface for current clocksource   * @dev:	unused + * @attr:	unused   * @buf:	char buffer to be filled with clocksource list   *   * Provides sysfs interface for listing current clocksource.   */  static ssize_t -sysfs_show_current_clocksources(struct sys_device *dev, -				struct sysdev_attribute *attr, char *buf) +sysfs_show_current_clocksources(struct device *dev, +				struct device_attribute *attr, char *buf)  {  	ssize_t count = 0; @@ -765,35 +909,44 @@ sysfs_show_current_clocksources(struct sys_device *dev,  	return count;  } +ssize_t sysfs_get_uname(const char *buf, char *dst, size_t cnt) +{ +	size_t ret = cnt; + +	/* strings from sysfs write are not 0 terminated! */ +	if (!cnt || cnt >= CS_NAME_LEN) +		return -EINVAL; + +	/* strip of \n: */ +	if (buf[cnt-1] == '\n') +		cnt--; +	if (cnt > 0) +		memcpy(dst, buf, cnt); +	dst[cnt] = 0; +	return ret; +} +  /**   * sysfs_override_clocksource - interface for manually overriding clocksource   * @dev:	unused + * @attr:	unused   * @buf:	name of override clocksource   * @count:	length of buffer   *   * Takes input from sysfs interface for manually overriding the default   * clocksource selection.   
*/ -static ssize_t sysfs_override_clocksource(struct sys_device *dev, -					  struct sysdev_attribute *attr, +static ssize_t sysfs_override_clocksource(struct device *dev, +					  struct device_attribute *attr,  					  const char *buf, size_t count)  { -	size_t ret = count; - -	/* strings from sysfs write are not 0 terminated! */ -	if (count >= sizeof(override_name)) -		return -EINVAL; - -	/* strip of \n: */ -	if (buf[count-1] == '\n') -		count--; +	ssize_t ret;  	mutex_lock(&clocksource_mutex); -	if (count > 0) -		memcpy(override_name, buf, count); -	override_name[count] = 0; -	clocksource_select(); +	ret = sysfs_get_uname(buf, override_name, count); +	if (ret >= 0) +		clocksource_select();  	mutex_unlock(&clocksource_mutex); @@ -801,15 +954,50 @@ static ssize_t sysfs_override_clocksource(struct sys_device *dev,  }  /** + * sysfs_unbind_current_clocksource - interface for manually unbinding clocksource + * @dev:	unused + * @attr:	unused + * @buf:	unused + * @count:	length of buffer + * + * Takes input from sysfs interface for manually unbinding a clocksource. + */ +static ssize_t sysfs_unbind_clocksource(struct device *dev, +					struct device_attribute *attr, +					const char *buf, size_t count) +{ +	struct clocksource *cs; +	char name[CS_NAME_LEN]; +	ssize_t ret; + +	ret = sysfs_get_uname(buf, name, count); +	if (ret < 0) +		return ret; + +	ret = -ENODEV; +	mutex_lock(&clocksource_mutex); +	list_for_each_entry(cs, &clocksource_list, list) { +		if (strcmp(cs->name, name)) +			continue; +		ret = clocksource_unbind(cs); +		break; +	} +	mutex_unlock(&clocksource_mutex); + +	return ret ? ret : count; +} + +/**   * sysfs_show_available_clocksources - sysfs interface for listing clocksource   * @dev:	unused + * @attr:	unused   * @buf:	char buffer to be filled with clocksource list   *   * Provides sysfs interface for listing registered clocksources   */  static ssize_t -sysfs_show_available_clocksources(struct sys_device *dev, -				  struct sysdev_attribute *attr, +sysfs_show_available_clocksources(struct device *dev, +				  struct device_attribute *attr,  				  char *buf)  {  	struct clocksource *src; @@ -838,35 +1026,41 @@ sysfs_show_available_clocksources(struct sys_device *dev,  /*   * Sysfs setup bits:   */ -static SYSDEV_ATTR(current_clocksource, 0644, sysfs_show_current_clocksources, +static DEVICE_ATTR(current_clocksource, 0644, sysfs_show_current_clocksources,  		   sysfs_override_clocksource); -static SYSDEV_ATTR(available_clocksource, 0444, +static DEVICE_ATTR(unbind_clocksource, 0200, NULL, sysfs_unbind_clocksource); + +static DEVICE_ATTR(available_clocksource, 0444,  		   sysfs_show_available_clocksources, NULL); -static struct sysdev_class clocksource_sysclass = { +static struct bus_type clocksource_subsys = {  	.name = "clocksource", +	.dev_name = "clocksource",  }; -static struct sys_device device_clocksource = { +static struct device device_clocksource = {  	.id	= 0, -	.cls	= &clocksource_sysclass, +	.bus	= &clocksource_subsys,  };  static int __init init_clocksource_sysfs(void)  { -	int error = sysdev_class_register(&clocksource_sysclass); +	int error = subsys_system_register(&clocksource_subsys, NULL);  	if (!error) -		error = sysdev_register(&device_clocksource); +		error = device_register(&device_clocksource);  	if (!error) -		error = sysdev_create_file( +		error = device_create_file(  				&device_clocksource, -				&attr_current_clocksource); +				&dev_attr_current_clocksource); +	if (!error) +		error = device_create_file(&device_clocksource, +					   
&dev_attr_unbind_clocksource);  	if (!error) -		error = sysdev_create_file( +		error = device_create_file(  				&device_clocksource, -				&attr_available_clocksource); +				&dev_attr_available_clocksource);  	return error;  } diff --git a/kernel/time/jiffies.c b/kernel/time/jiffies.c index 5404a845690..a6a5bf53e86 100644 --- a/kernel/time/jiffies.c +++ b/kernel/time/jiffies.c @@ -22,8 +22,11 @@  ************************************************************************/  #include <linux/clocksource.h>  #include <linux/jiffies.h> +#include <linux/module.h>  #include <linux/init.h> +#include "tick-internal.h" +  /* The Jiffies based clocksource is the lowest common   * denominator clock source which should function on   * all systems. It has the same coarse resolution as @@ -31,10 +34,10 @@   * inaccuracies caused by missed or lost timer   * interrupts and the inability for the timer   * interrupt hardware to accuratly tick at the - * requested HZ value. It is also not reccomended + * requested HZ value. It is also not recommended   * for "tick-less" systems.   */ -#define NSEC_PER_JIFFY	((u32)((((u64)NSEC_PER_SEC)<<8)/ACTHZ)) +#define NSEC_PER_JIFFY	((NSEC_PER_SEC+HZ/2)/HZ)  /* Since jiffies uses a simple NSEC_PER_JIFFY multiplier   * conversion, the .shift value could be zero. However @@ -48,14 +51,20 @@   * HZ shrinks, so values greater than 8 overflow 32bits when   * HZ=100.   */ +#if HZ < 34 +#define JIFFIES_SHIFT	6 +#elif HZ < 67 +#define JIFFIES_SHIFT	7 +#else  #define JIFFIES_SHIFT	8 +#endif  static cycle_t jiffies_read(struct clocksource *cs)  {  	return (cycle_t) jiffies;  } -struct clocksource clocksource_jiffies = { +static struct clocksource clocksource_jiffies = {  	.name		= "jiffies",  	.rating		= 1, /* lowest valid rating*/  	.read		= jiffies_read, @@ -64,6 +73,25 @@ struct clocksource clocksource_jiffies = {  	.shift		= JIFFIES_SHIFT,  }; +__cacheline_aligned_in_smp DEFINE_SEQLOCK(jiffies_lock); + +#if (BITS_PER_LONG < 64) +u64 get_jiffies_64(void) +{ +	unsigned long seq; +	u64 ret; + +	do { +		seq = read_seqbegin(&jiffies_lock); +		ret = jiffies_64; +	} while (read_seqretry(&jiffies_lock, seq)); +	return ret; +} +EXPORT_SYMBOL(get_jiffies_64); +#endif + +EXPORT_SYMBOL(jiffies); +  static int __init init_jiffies_clocksource(void)  {  	return clocksource_register(&clocksource_jiffies); @@ -75,3 +103,33 @@ struct clocksource * __init __weak clocksource_default_clock(void)  {  	return &clocksource_jiffies;  } + +struct clocksource refined_jiffies; + +int register_refined_jiffies(long cycles_per_second) +{ +	u64 nsec_per_tick, shift_hz; +	long cycles_per_tick; + + + +	refined_jiffies = clocksource_jiffies; +	refined_jiffies.name = "refined-jiffies"; +	refined_jiffies.rating++; + +	/* Calc cycles per tick */ +	cycles_per_tick = (cycles_per_second + HZ/2)/HZ; +	/* shift_hz stores hz<<8 for extra accuracy */ +	shift_hz = (u64)cycles_per_second << 8; +	shift_hz += cycles_per_tick/2; +	do_div(shift_hz, cycles_per_tick); +	/* Calculate nsec_per_tick using shift_hz */ +	nsec_per_tick = (u64)NSEC_PER_SEC << 8; +	nsec_per_tick += (u32)shift_hz/2; +	do_div(nsec_per_tick, (u32)shift_hz); + +	refined_jiffies.mult = ((u32)nsec_per_tick) << JIFFIES_SHIFT; + +	clocksource_register(&refined_jiffies); +	return 0; +} diff --git a/kernel/time/ntp.c b/kernel/time/ntp.c index d2321891538..33db43a3951 100644 --- a/kernel/time/ntp.c +++ b/kernel/time/ntp.c @@ -14,22 +14,28 @@  #include <linux/timex.h>  #include <linux/time.h>  #include <linux/mm.h> +#include <linux/module.h> +#include <linux/rtc.h> + 
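
The register_refined_jiffies() hunk in the jiffies.c diff above replaces the nominal NSEC_PER_JIFFY multiplier with one computed from the timer rate that was actually programmed. A standalone sketch of that arithmetic follows; the i8253 rate of 1193182 Hz and HZ = 1000 (hence JIFFIES_SHIFT = 8) are illustrative assumptions, not values taken from the patch.

#include <stdint.h>
#include <stdio.h>

#define HZ		1000		/* assumed for this example */
#define JIFFIES_SHIFT	8
#define NSEC_PER_SEC	1000000000ULL

int main(void)
{
	long cycles_per_second = 1193182;	/* assumed i8253 input clock */
	long cycles_per_tick = (cycles_per_second + HZ / 2) / HZ;
	uint64_t shift_hz, nsec_per_tick;

	/* shift_hz stores hz << 8 for extra accuracy */
	shift_hz = (uint64_t)cycles_per_second << 8;
	shift_hz += cycles_per_tick / 2;
	shift_hz /= cycles_per_tick;

	/* nanoseconds per (real) tick, derived from shift_hz */
	nsec_per_tick = NSEC_PER_SEC << 8;
	nsec_per_tick += (uint32_t)shift_hz / 2;
	nsec_per_tick /= (uint32_t)shift_hz;

	printf("nominal mult = %u\n",
	       (uint32_t)(((NSEC_PER_SEC + HZ / 2) / HZ) << JIFFIES_SHIFT));
	printf("refined mult = %u\n",
	       (uint32_t)nsec_per_tick << JIFFIES_SHIFT);
	return 0;
}

On these example numbers the two multipliers differ by roughly 150 ppm, which is the systematic error the higher-rated refined-jiffies clocksource removes when the programmed tick period cannot match HZ exactly.
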
+#include "tick-internal.h" +#include "ntp_internal.h"  /*   * NTP timekeeping variables: + * + * Note: All of the NTP state is protected by the timekeeping locks.   */ +  /* USER_HZ period (usecs): */  unsigned long			tick_usec = TICK_USEC; -/* ACTHZ period (nsecs): */ +/* SHIFTED_HZ period (nsecs): */  unsigned long			tick_nsec; -u64				tick_length; +static u64			tick_length;  static u64			tick_length_base; -static struct hrtimer		leap_timer; -  #define MAX_TICKADJ		500LL		/* usecs */  #define MAX_TICKADJ_SCALED \  	(((MAX_TICKADJ * NSEC_PER_USEC) << NTP_SCALE_SHIFT) / NTP_INTERVAL_FREQ) @@ -46,10 +52,7 @@ static struct hrtimer		leap_timer;  static int			time_state = TIME_OK;  /* clock status bits:							*/ -int				time_status = STA_UNSYNC; - -/* TAI offset (secs):							*/ -static long			time_tai; +static int			time_status = STA_UNSYNC;  /* time adjustment (nsecs):						*/  static s64			time_offset; @@ -74,6 +77,169 @@ static long			time_adjust;  /* constant (boot-param configurable) NTP tick adjustment (upscaled)	*/  static s64			ntp_tick_adj; +#ifdef CONFIG_NTP_PPS + +/* + * The following variables are used when a pulse-per-second (PPS) signal + * is available. They establish the engineering parameters of the clock + * discipline loop when controlled by the PPS signal. + */ +#define PPS_VALID	10	/* PPS signal watchdog max (s) */ +#define PPS_POPCORN	4	/* popcorn spike threshold (shift) */ +#define PPS_INTMIN	2	/* min freq interval (s) (shift) */ +#define PPS_INTMAX	8	/* max freq interval (s) (shift) */ +#define PPS_INTCOUNT	4	/* number of consecutive good intervals to +				   increase pps_shift or consecutive bad +				   intervals to decrease it */ +#define PPS_MAXWANDER	100000	/* max PPS freq wander (ns/s) */ + +static int pps_valid;		/* signal watchdog counter */ +static long pps_tf[3];		/* phase median filter */ +static long pps_jitter;		/* current jitter (ns) */ +static struct timespec pps_fbase; /* beginning of the last freq interval */ +static int pps_shift;		/* current interval duration (s) (shift) */ +static int pps_intcnt;		/* interval counter */ +static s64 pps_freq;		/* frequency offset (scaled ns/s) */ +static long pps_stabil;		/* current stability (scaled ns/s) */ + +/* + * PPS signal quality monitors + */ +static long pps_calcnt;		/* calibration intervals */ +static long pps_jitcnt;		/* jitter limit exceeded */ +static long pps_stbcnt;		/* stability limit exceeded */ +static long pps_errcnt;		/* calibration errors */ + + +/* PPS kernel consumer compensates the whole phase error immediately. + * Otherwise, reduce the offset by a fixed factor times the time constant. + */ +static inline s64 ntp_offset_chunk(s64 offset) +{ +	if (time_status & STA_PPSTIME && time_status & STA_PPSSIGNAL) +		return offset; +	else +		return shift_right(offset, SHIFT_PLL + time_constant); +} + +static inline void pps_reset_freq_interval(void) +{ +	/* the PPS calibration interval may end +	   surprisingly early */ +	pps_shift = PPS_INTMIN; +	pps_intcnt = 0; +} + +/** + * pps_clear - Clears the PPS state variables + */ +static inline void pps_clear(void) +{ +	pps_reset_freq_interval(); +	pps_tf[0] = 0; +	pps_tf[1] = 0; +	pps_tf[2] = 0; +	pps_fbase.tv_sec = pps_fbase.tv_nsec = 0; +	pps_freq = 0; +} + +/* Decrease pps_valid to indicate that another second has passed since + * the last PPS signal. When it reaches 0, indicate that PPS signal is + * missing. 
+ */ +static inline void pps_dec_valid(void) +{ +	if (pps_valid > 0) +		pps_valid--; +	else { +		time_status &= ~(STA_PPSSIGNAL | STA_PPSJITTER | +				 STA_PPSWANDER | STA_PPSERROR); +		pps_clear(); +	} +} + +static inline void pps_set_freq(s64 freq) +{ +	pps_freq = freq; +} + +static inline int is_error_status(int status) +{ +	return (status & (STA_UNSYNC|STA_CLOCKERR)) +		/* PPS signal lost when either PPS time or +		 * PPS frequency synchronization requested +		 */ +		|| ((status & (STA_PPSFREQ|STA_PPSTIME)) +			&& !(status & STA_PPSSIGNAL)) +		/* PPS jitter exceeded when +		 * PPS time synchronization requested */ +		|| ((status & (STA_PPSTIME|STA_PPSJITTER)) +			== (STA_PPSTIME|STA_PPSJITTER)) +		/* PPS wander exceeded or calibration error when +		 * PPS frequency synchronization requested +		 */ +		|| ((status & STA_PPSFREQ) +			&& (status & (STA_PPSWANDER|STA_PPSERROR))); +} + +static inline void pps_fill_timex(struct timex *txc) +{ +	txc->ppsfreq	   = shift_right((pps_freq >> PPM_SCALE_INV_SHIFT) * +					 PPM_SCALE_INV, NTP_SCALE_SHIFT); +	txc->jitter	   = pps_jitter; +	if (!(time_status & STA_NANO)) +		txc->jitter /= NSEC_PER_USEC; +	txc->shift	   = pps_shift; +	txc->stabil	   = pps_stabil; +	txc->jitcnt	   = pps_jitcnt; +	txc->calcnt	   = pps_calcnt; +	txc->errcnt	   = pps_errcnt; +	txc->stbcnt	   = pps_stbcnt; +} + +#else /* !CONFIG_NTP_PPS */ + +static inline s64 ntp_offset_chunk(s64 offset) +{ +	return shift_right(offset, SHIFT_PLL + time_constant); +} + +static inline void pps_reset_freq_interval(void) {} +static inline void pps_clear(void) {} +static inline void pps_dec_valid(void) {} +static inline void pps_set_freq(s64 freq) {} + +static inline int is_error_status(int status) +{ +	return status & (STA_UNSYNC|STA_CLOCKERR); +} + +static inline void pps_fill_timex(struct timex *txc) +{ +	/* PPS is not implemented, so these are zero */ +	txc->ppsfreq	   = 0; +	txc->jitter	   = 0; +	txc->shift	   = 0; +	txc->stabil	   = 0; +	txc->jitcnt	   = 0; +	txc->calcnt	   = 0; +	txc->errcnt	   = 0; +	txc->stbcnt	   = 0; +} + +#endif /* CONFIG_NTP_PPS */ + + +/** + * ntp_synced - Returns 1 if the NTP status is not UNSYNC + * + */ +static inline int ntp_synced(void) +{ +	return !(time_status & STA_UNSYNC); +} + +  /*   * NTP methods:   */ @@ -116,7 +282,7 @@ static inline s64 ntp_update_offset_fll(s64 offset64, long secs)  	time_status |= STA_MODE; -	return div_s64(offset64 << (NTP_SCALE_SHIFT - SHIFT_FLL), secs); +	return div64_long(offset64 << (NTP_SCALE_SHIFT - SHIFT_FLL), secs);  }  static void ntp_update_offset(long offset) @@ -171,8 +337,6 @@ static void ntp_update_offset(long offset)  /**   * ntp_clear - Clears the NTP state variables - * - * Must be called while holding a write on the xtime_lock   */  void ntp_clear(void)  { @@ -185,63 +349,75 @@ void ntp_clear(void)  	tick_length	= tick_length_base;  	time_offset	= 0; + +	/* Clear PPS state variables */ +	pps_clear(); +} + + +u64 ntp_tick_length(void) +{ +	return tick_length;  } +  /* - * Leap second processing. If in leap-insert state at the end of the - * day, the system clock is set back one second; if in leap-delete - * state, the system clock is set ahead one second. + * this routine handles the overflow of the microsecond field + * + * The tricky bits of code to handle the accurate clock support + * were provided by Dave Mills (Mills@UDEL.EDU) of NTP fame. + * They were originally developed for SUN and DEC kernels. + * All the kudos should go to Dave for this stuff. 
+ * + * Also handles leap second processing, and returns leap offset   */ -static enum hrtimer_restart ntp_leap_second(struct hrtimer *timer) +int second_overflow(unsigned long secs)  { -	enum hrtimer_restart res = HRTIMER_NORESTART; - -	write_seqlock(&xtime_lock); +	s64 delta; +	int leap = 0; +	/* +	 * Leap second processing. If in leap-insert state at the end of the +	 * day, the system clock is set back one second; if in leap-delete +	 * state, the system clock is set ahead one second. +	 */  	switch (time_state) {  	case TIME_OK: +		if (time_status & STA_INS) +			time_state = TIME_INS; +		else if (time_status & STA_DEL) +			time_state = TIME_DEL;  		break;  	case TIME_INS: -		timekeeping_leap_insert(-1); -		time_state = TIME_OOP; -		printk(KERN_NOTICE -			"Clock: inserting leap second 23:59:60 UTC\n"); -		hrtimer_add_expires_ns(&leap_timer, NSEC_PER_SEC); -		res = HRTIMER_RESTART; +		if (!(time_status & STA_INS)) +			time_state = TIME_OK; +		else if (secs % 86400 == 0) { +			leap = -1; +			time_state = TIME_OOP; +			printk(KERN_NOTICE +				"Clock: inserting leap second 23:59:60 UTC\n"); +		}  		break;  	case TIME_DEL: -		timekeeping_leap_insert(1); -		time_tai--; -		time_state = TIME_WAIT; -		printk(KERN_NOTICE -			"Clock: deleting leap second 23:59:59 UTC\n"); +		if (!(time_status & STA_DEL)) +			time_state = TIME_OK; +		else if ((secs + 1) % 86400 == 0) { +			leap = 1; +			time_state = TIME_WAIT; +			printk(KERN_NOTICE +				"Clock: deleting leap second 23:59:59 UTC\n"); +		}  		break;  	case TIME_OOP: -		time_tai++;  		time_state = TIME_WAIT; -		/* fall through */ +		break; +  	case TIME_WAIT:  		if (!(time_status & (STA_INS | STA_DEL)))  			time_state = TIME_OK;  		break;  	} -	write_sequnlock(&xtime_lock); - -	return res; -} - -/* - * this routine handles the overflow of the microsecond field - * - * The tricky bits of code to handle the accurate clock support - * were provided by Dave Mills (Mills@UDEL.EDU) of NTP fame. - * They were originally developed for SUN and DEC kernels. - * All the kudos should go to Dave for this stuff. - */ -void second_overflow(void) -{ -	s64 delta;  	/* Bump the maxerror field */  	time_maxerror += MAXFREQ / NSEC_PER_USEC; @@ -250,41 +426,40 @@ void second_overflow(void)  		time_status |= STA_UNSYNC;  	} -	/* -	 * Compute the phase adjustment for the next second. The offset is -	 * reduced by a fixed factor times the time constant. 
-	 */ +	/* Compute the phase adjustment for the next second */  	tick_length	 = tick_length_base; -	delta		 = shift_right(time_offset, SHIFT_PLL + time_constant); +	delta		 = ntp_offset_chunk(time_offset);  	time_offset	-= delta;  	tick_length	+= delta; +	/* Check PPS signal */ +	pps_dec_valid(); +  	if (!time_adjust) -		return; +		goto out;  	if (time_adjust > MAX_TICKADJ) {  		time_adjust -= MAX_TICKADJ;  		tick_length += MAX_TICKADJ_SCALED; -		return; +		goto out;  	}  	if (time_adjust < -MAX_TICKADJ) {  		time_adjust += MAX_TICKADJ;  		tick_length -= MAX_TICKADJ_SCALED; -		return; +		goto out;  	}  	tick_length += (s64)(time_adjust * NSEC_PER_USEC / NTP_INTERVAL_FREQ)  							 << NTP_SCALE_SHIFT;  	time_adjust = 0; -} - -#ifdef CONFIG_GENERIC_CMOS_UPDATE -/* Disable the cmos update - used by virtualization and embedded */ -int no_sync_cmos_clock  __read_mostly; +out: +	return leap; +} +#if defined(CONFIG_GENERIC_CMOS_UPDATE) || defined(CONFIG_RTC_SYSTOHC)  static void sync_cmos_clock(struct work_struct *work);  static DECLARE_DELAYED_WORK(sync_cmos_work, sync_cmos_clock); @@ -300,6 +475,7 @@ static void sync_cmos_clock(struct work_struct *work)  	 * called as close as possible to 500 ms before the new second starts.  	 * This code is run on a timer.  If the clock is set, that timer  	 * may not expire at the correct time.  Thus, we adjust... +	 * We want the clock to be within a couple of ticks from the target.  	 */  	if (!ntp_synced()) {  		/* @@ -310,14 +486,26 @@ static void sync_cmos_clock(struct work_struct *work)  	}  	getnstimeofday(&now); -	if (abs(now.tv_nsec - (NSEC_PER_SEC / 2)) <= tick_nsec / 2) -		fail = update_persistent_clock(now); +	if (abs(now.tv_nsec - (NSEC_PER_SEC / 2)) <= tick_nsec * 5) { +		struct timespec adjust = now; + +		fail = -ENODEV; +		if (persistent_clock_is_local) +			adjust.tv_sec -= (sys_tz.tz_minuteswest * 60); +#ifdef CONFIG_GENERIC_CMOS_UPDATE +		fail = update_persistent_clock(adjust); +#endif +#ifdef CONFIG_RTC_SYSTOHC +		if (fail == -ENODEV) +			fail = rtc_set_ntp_time(adjust); +#endif +	}  	next.tv_nsec = (NSEC_PER_SEC / 2) - now.tv_nsec - (TICK_NSEC / 2);  	if (next.tv_nsec <= 0)  		next.tv_nsec += NSEC_PER_SEC; -	if (!fail) +	if (!fail || fail == -ENODEV)  		next.tv_sec = 659;  	else  		next.tv_sec = 0; @@ -326,40 +514,19 @@ static void sync_cmos_clock(struct work_struct *work)  		next.tv_sec++;  		next.tv_nsec -= NSEC_PER_SEC;  	} -	schedule_delayed_work(&sync_cmos_work, timespec_to_jiffies(&next)); +	queue_delayed_work(system_power_efficient_wq, +			   &sync_cmos_work, timespec_to_jiffies(&next));  } -static void notify_cmos_timer(void) +void ntp_notify_cmos_timer(void)  { -	if (!no_sync_cmos_clock) -		schedule_delayed_work(&sync_cmos_work, 0); +	queue_delayed_work(system_power_efficient_wq, &sync_cmos_work, 0);  }  #else -static inline void notify_cmos_timer(void) { } +void ntp_notify_cmos_timer(void) { }  #endif -/* - * Start the leap seconds timer: - */ -static inline void ntp_start_leap_timer(struct timespec *ts) -{ -	long now = ts->tv_sec; - -	if (time_status & STA_INS) { -		time_state = TIME_INS; -		now += 86400 - now % 86400; -		hrtimer_start(&leap_timer, ktime_set(now, 0), HRTIMER_MODE_ABS); - -		return; -	} - -	if (time_status & STA_DEL) { -		time_state = TIME_DEL; -		now += 86400 - (now + 1) % 86400; -		hrtimer_start(&leap_timer, ktime_set(now, 0), HRTIMER_MODE_ABS); -	} -}  /*   * Propagate a new txc->status value into the NTP state: @@ -369,6 +536,8 @@ static inline void process_adj_status(struct timex *txc, struct timespec *ts)  
	if ((time_status & STA_PLL) && !(txc->status & STA_PLL)) {  		time_state = TIME_OK;  		time_status = STA_UNSYNC; +		/* restart PPS frequency calibration */ +		pps_reset_freq_interval();  	}  	/* @@ -381,29 +550,12 @@ static inline void process_adj_status(struct timex *txc, struct timespec *ts)  	/* only set allowed bits */  	time_status &= STA_RONLY;  	time_status |= txc->status & ~STA_RONLY; - -	switch (time_state) { -	case TIME_OK: -		ntp_start_leap_timer(ts); -		break; -	case TIME_INS: -	case TIME_DEL: -		time_state = TIME_OK; -		ntp_start_leap_timer(ts); -	case TIME_WAIT: -		if (!(time_status & (STA_INS | STA_DEL))) -			time_state = TIME_OK; -		break; -	case TIME_OOP: -		hrtimer_restart(&leap_timer); -		break; -	}  } -/* - * Called with the xtime lock held, so we can access and modify - * all the global NTP state: - */ -static inline void process_adjtimex_modes(struct timex *txc, struct timespec *ts) + + +static inline void process_adjtimex_modes(struct timex *txc, +						struct timespec *ts, +						s32 *time_tai)  {  	if (txc->modes & ADJ_STATUS)  		process_adj_status(txc, ts); @@ -418,6 +570,8 @@ static inline void process_adjtimex_modes(struct timex *txc, struct timespec *ts  		time_freq = txc->freq * PPM_SCALE;  		time_freq = min(time_freq, MAXFREQ_SCALED);  		time_freq = max(time_freq, -MAXFREQ_SCALED); +		/* update pps_freq */ +		pps_set_freq(time_freq);  	}  	if (txc->modes & ADJ_MAXERROR) @@ -435,7 +589,7 @@ static inline void process_adjtimex_modes(struct timex *txc, struct timespec *ts  	}  	if (txc->modes & ADJ_TAI && txc->constant > 0) -		time_tai = txc->constant; +		*time_tai = txc->constant;  	if (txc->modes & ADJ_OFFSET)  		ntp_update_offset(txc->offset); @@ -447,16 +601,13 @@ static inline void process_adjtimex_modes(struct timex *txc, struct timespec *ts  		ntp_update_frequency();  } -/* - * adjtimex mainly allows reading (and writing, if superuser) of - * kernel time-keeping variables. used by xntpd. + + +/** + * ntp_validate_timex - Ensures the timex is ok for use in do_adjtimex   */ -int do_adjtimex(struct timex *txc) +int ntp_validate_timex(struct timex *txc)  { -	struct timespec ts; -	int result; - -	/* Validate the data before disabling interrupts */  	if (txc->modes & ADJ_ADJTIME) {  		/* singleshot must not be used with any other mode bits */  		if (!(txc->modes & ADJ_OFFSET_SINGLESHOT)) @@ -468,7 +619,6 @@ int do_adjtimex(struct timex *txc)  		/* In order to modify anything, you gotta be super-user! */  		 if (txc->modes && !capable(CAP_SYS_TIME))  			return -EPERM; -  		/*  		 * if the quartz is off by more than 10% then  		 * something is VERY wrong! @@ -477,14 +627,22 @@ int do_adjtimex(struct timex *txc)  		    (txc->tick <  900000/USER_HZ ||  		     txc->tick > 1100000/USER_HZ))  			return -EINVAL; - -		if (txc->modes & ADJ_STATUS && time_state != TIME_OK) -			hrtimer_cancel(&leap_timer);  	} -	getnstimeofday(&ts); +	if ((txc->modes & ADJ_SETOFFSET) && (!capable(CAP_SYS_TIME))) +		return -EPERM; + +	return 0; +} + -	write_seqlock_irq(&xtime_lock); +/* + * adjtimex mainly allows reading (and writing, if superuser) of + * kernel time-keeping variables. used by xntpd. 
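
Since __do_adjtimex() above now also fills the PPS statistics via pps_fill_timex(), the simplest way to look at the resulting state from userspace is a read-only adjtimex(2) query; with no ADJ_* mode bits set this needs no privileges, matching the capability check in ntp_validate_timex(). A minimal sketch:

#include <stdio.h>
#include <sys/timex.h>

int main(void)
{
	struct timex tx = { .modes = 0 };	/* no ADJ_* bits: read only */
	int state = adjtimex(&tx);

	if (state < 0) {
		perror("adjtimex");
		return 1;
	}
	/* state mirrors time_state (TIME_OK, TIME_INS, ...) or TIME_ERROR */
	printf("state=%d status=0x%x freq=%ld offset=%ld tai=%d\n",
	       state, tx.status, tx.freq, tx.offset, tx.tai);
	/* PPS fields, filled by pps_fill_timex() when CONFIG_NTP_PPS=y */
	printf("pps: jitter=%ld shift=%d stabil=%ld calcnt=%ld errcnt=%ld\n",
	       tx.jitter, tx.shift, tx.stabil, tx.calcnt, tx.errcnt);
	return 0;
}
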
+ */ +int __do_adjtimex(struct timex *txc, struct timespec *ts, s32 *time_tai) +{ +	int result;  	if (txc->modes & ADJ_ADJTIME) {  		long save_adjust = time_adjust; @@ -499,7 +657,7 @@ int do_adjtimex(struct timex *txc)  		/* If there are input parameters, then process them: */  		if (txc->modes) -			process_adjtimex_modes(txc, &ts); +			process_adjtimex_modes(txc, ts, time_tai);  		txc->offset = shift_right(time_offset * NTP_INTERVAL_FREQ,  				  NTP_SCALE_SHIFT); @@ -508,7 +666,8 @@ int do_adjtimex(struct timex *txc)  	}  	result = time_state;	/* mostly `TIME_OK' */ -	if (time_status & (STA_UNSYNC|STA_CLOCKERR)) +	/* check for errors */ +	if (is_error_status(time_status))  		result = TIME_ERROR;  	txc->freq	   = shift_right((time_freq >> PPM_SCALE_INV_SHIFT) * @@ -520,33 +679,257 @@ int do_adjtimex(struct timex *txc)  	txc->precision	   = 1;  	txc->tolerance	   = MAXFREQ_SCALED / PPM_SCALE;  	txc->tick	   = tick_usec; -	txc->tai	   = time_tai; - -	/* PPS is not implemented, so these are zero */ -	txc->ppsfreq	   = 0; -	txc->jitter	   = 0; -	txc->shift	   = 0; -	txc->stabil	   = 0; -	txc->jitcnt	   = 0; -	txc->calcnt	   = 0; -	txc->errcnt	   = 0; -	txc->stbcnt	   = 0; +	txc->tai	   = *time_tai; -	write_sequnlock_irq(&xtime_lock); +	/* fill PPS status fields */ +	pps_fill_timex(txc); -	txc->time.tv_sec = ts.tv_sec; -	txc->time.tv_usec = ts.tv_nsec; +	txc->time.tv_sec = ts->tv_sec; +	txc->time.tv_usec = ts->tv_nsec;  	if (!(time_status & STA_NANO))  		txc->time.tv_usec /= NSEC_PER_USEC; -	notify_cmos_timer(); -  	return result;  } +#ifdef	CONFIG_NTP_PPS + +/* actually struct pps_normtime is good old struct timespec, but it is + * semantically different (and it is the reason why it was invented): + * pps_normtime.nsec has a range of ( -NSEC_PER_SEC / 2, NSEC_PER_SEC / 2 ] + * while timespec.tv_nsec has a range of [0, NSEC_PER_SEC) */ +struct pps_normtime { +	__kernel_time_t	sec;	/* seconds */ +	long		nsec;	/* nanoseconds */ +}; + +/* normalize the timestamp so that nsec is in the +   ( -NSEC_PER_SEC / 2, NSEC_PER_SEC / 2 ] interval */ +static inline struct pps_normtime pps_normalize_ts(struct timespec ts) +{ +	struct pps_normtime norm = { +		.sec = ts.tv_sec, +		.nsec = ts.tv_nsec +	}; + +	if (norm.nsec > (NSEC_PER_SEC >> 1)) { +		norm.nsec -= NSEC_PER_SEC; +		norm.sec++; +	} + +	return norm; +} + +/* get current phase correction and jitter */ +static inline long pps_phase_filter_get(long *jitter) +{ +	*jitter = pps_tf[0] - pps_tf[1]; +	if (*jitter < 0) +		*jitter = -*jitter; + +	/* TODO: test various filters */ +	return pps_tf[0]; +} + +/* add the sample to the phase filter */ +static inline void pps_phase_filter_add(long err) +{ +	pps_tf[2] = pps_tf[1]; +	pps_tf[1] = pps_tf[0]; +	pps_tf[0] = err; +} + +/* decrease frequency calibration interval length. + * It is halved after four consecutive unstable intervals. + */ +static inline void pps_dec_freq_interval(void) +{ +	if (--pps_intcnt <= -PPS_INTCOUNT) { +		pps_intcnt = -PPS_INTCOUNT; +		if (pps_shift > PPS_INTMIN) { +			pps_shift--; +			pps_intcnt = 0; +		} +	} +} + +/* increase frequency calibration interval length. + * It is doubled after four consecutive stable intervals. 
+ */ +static inline void pps_inc_freq_interval(void) +{ +	if (++pps_intcnt >= PPS_INTCOUNT) { +		pps_intcnt = PPS_INTCOUNT; +		if (pps_shift < PPS_INTMAX) { +			pps_shift++; +			pps_intcnt = 0; +		} +	} +} + +/* update clock frequency based on MONOTONIC_RAW clock PPS signal + * timestamps + * + * At the end of the calibration interval the difference between the + * first and last MONOTONIC_RAW clock timestamps divided by the length + * of the interval becomes the frequency update. If the interval was + * too long, the data are discarded. + * Returns the difference between old and new frequency values. + */ +static long hardpps_update_freq(struct pps_normtime freq_norm) +{ +	long delta, delta_mod; +	s64 ftemp; + +	/* check if the frequency interval was too long */ +	if (freq_norm.sec > (2 << pps_shift)) { +		time_status |= STA_PPSERROR; +		pps_errcnt++; +		pps_dec_freq_interval(); +		printk_deferred(KERN_ERR +			"hardpps: PPSERROR: interval too long - %ld s\n", +			freq_norm.sec); +		return 0; +	} + +	/* here the raw frequency offset and wander (stability) is +	 * calculated. If the wander is less than the wander threshold +	 * the interval is increased; otherwise it is decreased. +	 */ +	ftemp = div_s64(((s64)(-freq_norm.nsec)) << NTP_SCALE_SHIFT, +			freq_norm.sec); +	delta = shift_right(ftemp - pps_freq, NTP_SCALE_SHIFT); +	pps_freq = ftemp; +	if (delta > PPS_MAXWANDER || delta < -PPS_MAXWANDER) { +		printk_deferred(KERN_WARNING +				"hardpps: PPSWANDER: change=%ld\n", delta); +		time_status |= STA_PPSWANDER; +		pps_stbcnt++; +		pps_dec_freq_interval(); +	} else {	/* good sample */ +		pps_inc_freq_interval(); +	} + +	/* the stability metric is calculated as the average of recent +	 * frequency changes, but is used only for performance +	 * monitoring +	 */ +	delta_mod = delta; +	if (delta_mod < 0) +		delta_mod = -delta_mod; +	pps_stabil += (div_s64(((s64)delta_mod) << +				(NTP_SCALE_SHIFT - SHIFT_USEC), +				NSEC_PER_USEC) - pps_stabil) >> PPS_INTMIN; + +	/* if enabled, the system clock frequency is updated */ +	if ((time_status & STA_PPSFREQ) != 0 && +	    (time_status & STA_FREQHOLD) == 0) { +		time_freq = pps_freq; +		ntp_update_frequency(); +	} + +	return delta; +} + +/* correct REALTIME clock phase error against PPS signal */ +static void hardpps_update_phase(long error) +{ +	long correction = -error; +	long jitter; + +	/* add the sample to the median filter */ +	pps_phase_filter_add(correction); +	correction = pps_phase_filter_get(&jitter); + +	/* Nominal jitter is due to PPS signal noise. If it exceeds the +	 * threshold, the sample is discarded; otherwise, if so enabled, +	 * the time offset is updated. +	 */ +	if (jitter > (pps_jitter << PPS_POPCORN)) { +		printk_deferred(KERN_WARNING +				"hardpps: PPSJITTER: jitter=%ld, limit=%ld\n", +				jitter, (pps_jitter << PPS_POPCORN)); +		time_status |= STA_PPSJITTER; +		pps_jitcnt++; +	} else if (time_status & STA_PPSTIME) { +		/* correct the time using the phase offset */ +		time_offset = div_s64(((s64)correction) << NTP_SCALE_SHIFT, +				NTP_INTERVAL_FREQ); +		/* cancel running adjtime() */ +		time_adjust = 0; +	} +	/* update jitter */ +	pps_jitter += (jitter - pps_jitter) >> PPS_INTMIN; +} + +/* + * __hardpps() - discipline CPU clock oscillator to external PPS signal + * + * This routine is called at each PPS signal arrival in order to + * discipline the CPU clock oscillator to the PPS signal. It takes two + * parameters: REALTIME and MONOTONIC_RAW clock timestamps. 
The former + * is used to correct clock phase error and the latter is used to + * correct the frequency. + * + * This code is based on David Mills's reference nanokernel + * implementation. It was mostly rewritten but keeps the same idea. + */ +void __hardpps(const struct timespec *phase_ts, const struct timespec *raw_ts) +{ +	struct pps_normtime pts_norm, freq_norm; + +	pts_norm = pps_normalize_ts(*phase_ts); + +	/* clear the error bits, they will be set again if needed */ +	time_status &= ~(STA_PPSJITTER | STA_PPSWANDER | STA_PPSERROR); + +	/* indicate signal presence */ +	time_status |= STA_PPSSIGNAL; +	pps_valid = PPS_VALID; + +	/* when called for the first time, +	 * just start the frequency interval */ +	if (unlikely(pps_fbase.tv_sec == 0)) { +		pps_fbase = *raw_ts; +		return; +	} + +	/* ok, now we have a base for frequency calculation */ +	freq_norm = pps_normalize_ts(timespec_sub(*raw_ts, pps_fbase)); + +	/* check that the signal is in the range +	 * [1s - MAXFREQ us, 1s + MAXFREQ us], otherwise reject it */ +	if ((freq_norm.sec == 0) || +			(freq_norm.nsec > MAXFREQ * freq_norm.sec) || +			(freq_norm.nsec < -MAXFREQ * freq_norm.sec)) { +		time_status |= STA_PPSJITTER; +		/* restart the frequency calibration interval */ +		pps_fbase = *raw_ts; +		printk_deferred(KERN_ERR "hardpps: PPSJITTER: bad pulse\n"); +		return; +	} + +	/* signal is ok */ + +	/* check if the current frequency interval is finished */ +	if (freq_norm.sec >= (1 << pps_shift)) { +		pps_calcnt++; +		/* restart the frequency calibration interval */ +		pps_fbase = *raw_ts; +		hardpps_update_freq(freq_norm); +	} + +	hardpps_update_phase(pts_norm.nsec); + +} +#endif	/* CONFIG_NTP_PPS */ +  static int __init ntp_tick_adj_setup(char *str)  { -	ntp_tick_adj = simple_strtol(str, NULL, 0); +	int rc = kstrtol(str, 0, (long *)&ntp_tick_adj); + +	if (rc) +		return rc;  	ntp_tick_adj <<= NTP_SCALE_SHIFT;  	return 1; @@ -557,6 +940,4 @@ __setup("ntp_tick_adj=", ntp_tick_adj_setup);  void __init ntp_init(void)  {  	ntp_clear(); -	hrtimer_init(&leap_timer, CLOCK_REALTIME, HRTIMER_MODE_ABS); -	leap_timer.function = ntp_leap_second;  } diff --git a/kernel/time/ntp_internal.h b/kernel/time/ntp_internal.h new file mode 100644 index 00000000000..1950cb4ca2a --- /dev/null +++ b/kernel/time/ntp_internal.h @@ -0,0 +1,12 @@ +#ifndef _LINUX_NTP_INTERNAL_H +#define _LINUX_NTP_INTERNAL_H + +extern void ntp_init(void); +extern void ntp_clear(void); +/* Returns how long ticks are at present, in ns / 2^NTP_SCALE_SHIFT. */ +extern u64 ntp_tick_length(void); +extern int second_overflow(unsigned long secs); +extern int ntp_validate_timex(struct timex *); +extern int __do_adjtimex(struct timex *, struct timespec *, s32 *); +extern void __hardpps(const struct timespec *, const struct timespec *); +#endif /* _LINUX_NTP_INTERNAL_H */ diff --git a/kernel/time/posix-clock.c b/kernel/time/posix-clock.c new file mode 100644 index 00000000000..ce033c7aa2e --- /dev/null +++ b/kernel/time/posix-clock.c @@ -0,0 +1,446 @@ +/* + * posix-clock.c - support for dynamic clock devices + * + * Copyright (C) 2010 OMICRON electronics GmbH + * + *  This program is free software; you can redistribute it and/or modify + *  it under the terms of the GNU General Public License as published by + *  the Free Software Foundation; either version 2 of the License, or + *  (at your option) any later version. 
+ * + *  This program is distributed in the hope that it will be useful, + *  but WITHOUT ANY WARRANTY; without even the implied warranty of + *  MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the + *  GNU General Public License for more details. + * + *  You should have received a copy of the GNU General Public License + *  along with this program; if not, write to the Free Software + *  Foundation, Inc., 675 Mass Ave, Cambridge, MA 02139, USA. + */ +#include <linux/device.h> +#include <linux/export.h> +#include <linux/file.h> +#include <linux/posix-clock.h> +#include <linux/slab.h> +#include <linux/syscalls.h> +#include <linux/uaccess.h> + +static void delete_clock(struct kref *kref); + +/* + * Returns NULL if the posix_clock instance attached to 'fp' is old and stale. + */ +static struct posix_clock *get_posix_clock(struct file *fp) +{ +	struct posix_clock *clk = fp->private_data; + +	down_read(&clk->rwsem); + +	if (!clk->zombie) +		return clk; + +	up_read(&clk->rwsem); + +	return NULL; +} + +static void put_posix_clock(struct posix_clock *clk) +{ +	up_read(&clk->rwsem); +} + +static ssize_t posix_clock_read(struct file *fp, char __user *buf, +				size_t count, loff_t *ppos) +{ +	struct posix_clock *clk = get_posix_clock(fp); +	int err = -EINVAL; + +	if (!clk) +		return -ENODEV; + +	if (clk->ops.read) +		err = clk->ops.read(clk, fp->f_flags, buf, count); + +	put_posix_clock(clk); + +	return err; +} + +static unsigned int posix_clock_poll(struct file *fp, poll_table *wait) +{ +	struct posix_clock *clk = get_posix_clock(fp); +	int result = 0; + +	if (!clk) +		return -ENODEV; + +	if (clk->ops.poll) +		result = clk->ops.poll(clk, fp, wait); + +	put_posix_clock(clk); + +	return result; +} + +static int posix_clock_fasync(int fd, struct file *fp, int on) +{ +	struct posix_clock *clk = get_posix_clock(fp); +	int err = 0; + +	if (!clk) +		return -ENODEV; + +	if (clk->ops.fasync) +		err = clk->ops.fasync(clk, fd, fp, on); + +	put_posix_clock(clk); + +	return err; +} + +static int posix_clock_mmap(struct file *fp, struct vm_area_struct *vma) +{ +	struct posix_clock *clk = get_posix_clock(fp); +	int err = -ENODEV; + +	if (!clk) +		return -ENODEV; + +	if (clk->ops.mmap) +		err = clk->ops.mmap(clk, vma); + +	put_posix_clock(clk); + +	return err; +} + +static long posix_clock_ioctl(struct file *fp, +			      unsigned int cmd, unsigned long arg) +{ +	struct posix_clock *clk = get_posix_clock(fp); +	int err = -ENOTTY; + +	if (!clk) +		return -ENODEV; + +	if (clk->ops.ioctl) +		err = clk->ops.ioctl(clk, cmd, arg); + +	put_posix_clock(clk); + +	return err; +} + +#ifdef CONFIG_COMPAT +static long posix_clock_compat_ioctl(struct file *fp, +				     unsigned int cmd, unsigned long arg) +{ +	struct posix_clock *clk = get_posix_clock(fp); +	int err = -ENOTTY; + +	if (!clk) +		return -ENODEV; + +	if (clk->ops.ioctl) +		err = clk->ops.ioctl(clk, cmd, arg); + +	put_posix_clock(clk); + +	return err; +} +#endif + +static int posix_clock_open(struct inode *inode, struct file *fp) +{ +	int err; +	struct posix_clock *clk = +		container_of(inode->i_cdev, struct posix_clock, cdev); + +	down_read(&clk->rwsem); + +	if (clk->zombie) { +		err = -ENODEV; +		goto out; +	} +	if (clk->ops.open) +		err = clk->ops.open(clk, fp->f_mode); +	else +		err = 0; + +	if (!err) { +		kref_get(&clk->kref); +		fp->private_data = clk; +	} +out: +	up_read(&clk->rwsem); +	return err; +} + +static int posix_clock_release(struct inode *inode, struct file *fp) +{ +	struct posix_clock *clk = fp->private_data; +	int err = 0; + +	if 
(clk->ops.release) +		err = clk->ops.release(clk); + +	kref_put(&clk->kref, delete_clock); + +	fp->private_data = NULL; + +	return err; +} + +static const struct file_operations posix_clock_file_operations = { +	.owner		= THIS_MODULE, +	.llseek		= no_llseek, +	.read		= posix_clock_read, +	.poll		= posix_clock_poll, +	.unlocked_ioctl	= posix_clock_ioctl, +	.open		= posix_clock_open, +	.release	= posix_clock_release, +	.fasync		= posix_clock_fasync, +	.mmap		= posix_clock_mmap, +#ifdef CONFIG_COMPAT +	.compat_ioctl	= posix_clock_compat_ioctl, +#endif +}; + +int posix_clock_register(struct posix_clock *clk, dev_t devid) +{ +	int err; + +	kref_init(&clk->kref); +	init_rwsem(&clk->rwsem); + +	cdev_init(&clk->cdev, &posix_clock_file_operations); +	clk->cdev.owner = clk->ops.owner; +	err = cdev_add(&clk->cdev, devid, 1); + +	return err; +} +EXPORT_SYMBOL_GPL(posix_clock_register); + +static void delete_clock(struct kref *kref) +{ +	struct posix_clock *clk = container_of(kref, struct posix_clock, kref); + +	if (clk->release) +		clk->release(clk); +} + +void posix_clock_unregister(struct posix_clock *clk) +{ +	cdev_del(&clk->cdev); + +	down_write(&clk->rwsem); +	clk->zombie = true; +	up_write(&clk->rwsem); + +	kref_put(&clk->kref, delete_clock); +} +EXPORT_SYMBOL_GPL(posix_clock_unregister); + +struct posix_clock_desc { +	struct file *fp; +	struct posix_clock *clk; +}; + +static int get_clock_desc(const clockid_t id, struct posix_clock_desc *cd) +{ +	struct file *fp = fget(CLOCKID_TO_FD(id)); +	int err = -EINVAL; + +	if (!fp) +		return err; + +	if (fp->f_op->open != posix_clock_open || !fp->private_data) +		goto out; + +	cd->fp = fp; +	cd->clk = get_posix_clock(fp); + +	err = cd->clk ? 0 : -ENODEV; +out: +	if (err) +		fput(fp); +	return err; +} + +static void put_clock_desc(struct posix_clock_desc *cd) +{ +	put_posix_clock(cd->clk); +	fput(cd->fp); +} + +static int pc_clock_adjtime(clockid_t id, struct timex *tx) +{ +	struct posix_clock_desc cd; +	int err; + +	err = get_clock_desc(id, &cd); +	if (err) +		return err; + +	if ((cd.fp->f_mode & FMODE_WRITE) == 0) { +		err = -EACCES; +		goto out; +	} + +	if (cd.clk->ops.clock_adjtime) +		err = cd.clk->ops.clock_adjtime(cd.clk, tx); +	else +		err = -EOPNOTSUPP; +out: +	put_clock_desc(&cd); + +	return err; +} + +static int pc_clock_gettime(clockid_t id, struct timespec *ts) +{ +	struct posix_clock_desc cd; +	int err; + +	err = get_clock_desc(id, &cd); +	if (err) +		return err; + +	if (cd.clk->ops.clock_gettime) +		err = cd.clk->ops.clock_gettime(cd.clk, ts); +	else +		err = -EOPNOTSUPP; + +	put_clock_desc(&cd); + +	return err; +} + +static int pc_clock_getres(clockid_t id, struct timespec *ts) +{ +	struct posix_clock_desc cd; +	int err; + +	err = get_clock_desc(id, &cd); +	if (err) +		return err; + +	if (cd.clk->ops.clock_getres) +		err = cd.clk->ops.clock_getres(cd.clk, ts); +	else +		err = -EOPNOTSUPP; + +	put_clock_desc(&cd); + +	return err; +} + +static int pc_clock_settime(clockid_t id, const struct timespec *ts) +{ +	struct posix_clock_desc cd; +	int err; + +	err = get_clock_desc(id, &cd); +	if (err) +		return err; + +	if ((cd.fp->f_mode & FMODE_WRITE) == 0) { +		err = -EACCES; +		goto out; +	} + +	if (cd.clk->ops.clock_settime) +		err = cd.clk->ops.clock_settime(cd.clk, ts); +	else +		err = -EOPNOTSUPP; +out: +	put_clock_desc(&cd); + +	return err; +} + +static int pc_timer_create(struct k_itimer *kit) +{ +	clockid_t id = kit->it_clock; +	struct posix_clock_desc cd; +	int err; + +	err = get_clock_desc(id, &cd); +	if (err) +		return err; + +	if 
(cd.clk->ops.timer_create) +		err = cd.clk->ops.timer_create(cd.clk, kit); +	else +		err = -EOPNOTSUPP; + +	put_clock_desc(&cd); + +	return err; +} + +static int pc_timer_delete(struct k_itimer *kit) +{ +	clockid_t id = kit->it_clock; +	struct posix_clock_desc cd; +	int err; + +	err = get_clock_desc(id, &cd); +	if (err) +		return err; + +	if (cd.clk->ops.timer_delete) +		err = cd.clk->ops.timer_delete(cd.clk, kit); +	else +		err = -EOPNOTSUPP; + +	put_clock_desc(&cd); + +	return err; +} + +static void pc_timer_gettime(struct k_itimer *kit, struct itimerspec *ts) +{ +	clockid_t id = kit->it_clock; +	struct posix_clock_desc cd; + +	if (get_clock_desc(id, &cd)) +		return; + +	if (cd.clk->ops.timer_gettime) +		cd.clk->ops.timer_gettime(cd.clk, kit, ts); + +	put_clock_desc(&cd); +} + +static int pc_timer_settime(struct k_itimer *kit, int flags, +			    struct itimerspec *ts, struct itimerspec *old) +{ +	clockid_t id = kit->it_clock; +	struct posix_clock_desc cd; +	int err; + +	err = get_clock_desc(id, &cd); +	if (err) +		return err; + +	if (cd.clk->ops.timer_settime) +		err = cd.clk->ops.timer_settime(cd.clk, kit, flags, ts, old); +	else +		err = -EOPNOTSUPP; + +	put_clock_desc(&cd); + +	return err; +} + +struct k_clock clock_posix_dynamic = { +	.clock_getres	= pc_clock_getres, +	.clock_set	= pc_clock_settime, +	.clock_get	= pc_clock_gettime, +	.clock_adj	= pc_clock_adjtime, +	.timer_create	= pc_timer_create, +	.timer_set	= pc_timer_settime, +	.timer_del	= pc_timer_delete, +	.timer_get	= pc_timer_gettime, +}; diff --git a/kernel/time/sched_clock.c b/kernel/time/sched_clock.c new file mode 100644 index 00000000000..01d2d15aa66 --- /dev/null +++ b/kernel/time/sched_clock.c @@ -0,0 +1,217 @@ +/* + * sched_clock.c: support for extending counters to full 64-bit ns counter + * + * This program is free software; you can redistribute it and/or modify + * it under the terms of the GNU General Public License version 2 as + * published by the Free Software Foundation. 
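
The clock_posix_dynamic table above lets ordinary clock_gettime()/clock_settime()/clock_adjtime() calls on a file-descriptor based clockid reach the character device's posix_clock_operations. A userspace sketch follows; the /dev/ptp0 path and the FD_TO_CLOCKID() encoding (intended as the inverse of the CLOCKID_TO_FD() lookup in get_clock_desc()) are assumptions for illustration, not something this patch defines.

#include <fcntl.h>
#include <stdio.h>
#include <time.h>
#include <unistd.h>

/* assumed fd<->clockid encoding, mirroring CLOCKID_TO_FD() above */
#define FD_TO_CLOCKID(fd)	((~(clockid_t)(fd) << 3) | 3)

int main(void)
{
	struct timespec ts;
	int fd = open("/dev/ptp0", O_RDONLY);	/* assumed PTP clock device */

	if (fd < 0) {
		perror("open");
		return 1;
	}
	/* routed to pc_clock_gettime() -> clk->ops.clock_gettime() */
	if (clock_gettime(FD_TO_CLOCKID(fd), &ts))
		perror("clock_gettime");
	else
		printf("%lld.%09ld\n", (long long)ts.tv_sec, ts.tv_nsec);
	close(fd);
	return 0;
}

Settable operations go through the same path but additionally require the descriptor to be opened writable, as enforced by the FMODE_WRITE checks in pc_clock_settime() and pc_clock_adjtime() above.
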
+ */ +#include <linux/clocksource.h> +#include <linux/init.h> +#include <linux/jiffies.h> +#include <linux/ktime.h> +#include <linux/kernel.h> +#include <linux/moduleparam.h> +#include <linux/sched.h> +#include <linux/syscore_ops.h> +#include <linux/hrtimer.h> +#include <linux/sched_clock.h> +#include <linux/seqlock.h> +#include <linux/bitops.h> + +struct clock_data { +	ktime_t wrap_kt; +	u64 epoch_ns; +	u64 epoch_cyc; +	seqcount_t seq; +	unsigned long rate; +	u32 mult; +	u32 shift; +	bool suspended; +}; + +static struct hrtimer sched_clock_timer; +static int irqtime = -1; + +core_param(irqtime, irqtime, int, 0400); + +static struct clock_data cd = { +	.mult	= NSEC_PER_SEC / HZ, +}; + +static u64 __read_mostly sched_clock_mask; + +static u64 notrace jiffy_sched_clock_read(void) +{ +	/* +	 * We don't need to use get_jiffies_64 on 32-bit arches here +	 * because we register with BITS_PER_LONG +	 */ +	return (u64)(jiffies - INITIAL_JIFFIES); +} + +static u64 __read_mostly (*read_sched_clock)(void) = jiffy_sched_clock_read; + +static inline u64 notrace cyc_to_ns(u64 cyc, u32 mult, u32 shift) +{ +	return (cyc * mult) >> shift; +} + +unsigned long long notrace sched_clock(void) +{ +	u64 epoch_ns; +	u64 epoch_cyc; +	u64 cyc; +	unsigned long seq; + +	if (cd.suspended) +		return cd.epoch_ns; + +	do { +		seq = raw_read_seqcount_begin(&cd.seq); +		epoch_cyc = cd.epoch_cyc; +		epoch_ns = cd.epoch_ns; +	} while (read_seqcount_retry(&cd.seq, seq)); + +	cyc = read_sched_clock(); +	cyc = (cyc - epoch_cyc) & sched_clock_mask; +	return epoch_ns + cyc_to_ns(cyc, cd.mult, cd.shift); +} + +/* + * Atomically update the sched_clock epoch. + */ +static void notrace update_sched_clock(void) +{ +	unsigned long flags; +	u64 cyc; +	u64 ns; + +	cyc = read_sched_clock(); +	ns = cd.epoch_ns + +		cyc_to_ns((cyc - cd.epoch_cyc) & sched_clock_mask, +			  cd.mult, cd.shift); + +	raw_local_irq_save(flags); +	raw_write_seqcount_begin(&cd.seq); +	cd.epoch_ns = ns; +	cd.epoch_cyc = cyc; +	raw_write_seqcount_end(&cd.seq); +	raw_local_irq_restore(flags); +} + +static enum hrtimer_restart sched_clock_poll(struct hrtimer *hrt) +{ +	update_sched_clock(); +	hrtimer_forward_now(hrt, cd.wrap_kt); +	return HRTIMER_RESTART; +} + +void __init sched_clock_register(u64 (*read)(void), int bits, +				 unsigned long rate) +{ +	u64 res, wrap, new_mask, new_epoch, cyc, ns; +	u32 new_mult, new_shift; +	ktime_t new_wrap_kt; +	unsigned long r; +	char r_unit; + +	if (cd.rate > rate) +		return; + +	WARN_ON(!irqs_disabled()); + +	/* calculate the mult/shift to convert counter ticks to ns. 
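
The conversion above is plain fixed-point arithmetic: ns = (cyc * mult) >> shift. A worked example with illustrative numbers only; clocks_calc_mult_shift() may choose a different pair depending on the conversion range it is asked to cover (3600 s here).

#include <linux/types.h>

/*
 * Assume a 24 MHz counter and shift = 24. Then
 *   mult = (NSEC_PER_SEC << 24) / 24000000   (about 699050667)
 * and 24000000 cycles scale to roughly NSEC_PER_SEC:
 */
static inline u64 example_cyc_to_ns_24mhz(u64 cyc)
{
	const u32 mult = 699050667, shift = 24;

	return (cyc * mult) >> shift;	/* 24000000 -> ~1000000000 ns */
}
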
*/ +	clocks_calc_mult_shift(&new_mult, &new_shift, rate, NSEC_PER_SEC, 3600); + +	new_mask = CLOCKSOURCE_MASK(bits); + +	/* calculate how many ns until we wrap */ +	wrap = clocks_calc_max_nsecs(new_mult, new_shift, 0, new_mask); +	new_wrap_kt = ns_to_ktime(wrap - (wrap >> 3)); + +	/* update epoch for new counter and update epoch_ns from old counter*/ +	new_epoch = read(); +	cyc = read_sched_clock(); +	ns = cd.epoch_ns + cyc_to_ns((cyc - cd.epoch_cyc) & sched_clock_mask, +			  cd.mult, cd.shift); + +	raw_write_seqcount_begin(&cd.seq); +	read_sched_clock = read; +	sched_clock_mask = new_mask; +	cd.rate = rate; +	cd.wrap_kt = new_wrap_kt; +	cd.mult = new_mult; +	cd.shift = new_shift; +	cd.epoch_cyc = new_epoch; +	cd.epoch_ns = ns; +	raw_write_seqcount_end(&cd.seq); + +	r = rate; +	if (r >= 4000000) { +		r /= 1000000; +		r_unit = 'M'; +	} else if (r >= 1000) { +		r /= 1000; +		r_unit = 'k'; +	} else +		r_unit = ' '; + +	/* calculate the ns resolution of this counter */ +	res = cyc_to_ns(1ULL, new_mult, new_shift); + +	pr_info("sched_clock: %u bits at %lu%cHz, resolution %lluns, wraps every %lluns\n", +		bits, r, r_unit, res, wrap); + +	/* Enable IRQ time accounting if we have a fast enough sched_clock */ +	if (irqtime > 0 || (irqtime == -1 && rate >= 1000000)) +		enable_sched_clock_irqtime(); + +	pr_debug("Registered %pF as sched_clock source\n", read); +} + +void __init sched_clock_postinit(void) +{ +	/* +	 * If no sched_clock function has been provided at that point, +	 * make it the final one one. +	 */ +	if (read_sched_clock == jiffy_sched_clock_read) +		sched_clock_register(jiffy_sched_clock_read, BITS_PER_LONG, HZ); + +	update_sched_clock(); + +	/* +	 * Start the timer to keep sched_clock() properly updated and +	 * sets the initial epoch. +	 */ +	hrtimer_init(&sched_clock_timer, CLOCK_MONOTONIC, HRTIMER_MODE_REL); +	sched_clock_timer.function = sched_clock_poll; +	hrtimer_start(&sched_clock_timer, cd.wrap_kt, HRTIMER_MODE_REL); +} + +static int sched_clock_suspend(void) +{ +	update_sched_clock(); +	hrtimer_cancel(&sched_clock_timer); +	cd.suspended = true; +	return 0; +} + +static void sched_clock_resume(void) +{ +	cd.epoch_cyc = read_sched_clock(); +	hrtimer_start(&sched_clock_timer, cd.wrap_kt, HRTIMER_MODE_REL); +	cd.suspended = false; +} + +static struct syscore_ops sched_clock_ops = { +	.suspend = sched_clock_suspend, +	.resume = sched_clock_resume, +}; + +static int __init sched_clock_syscore_init(void) +{ +	register_syscore_ops(&sched_clock_ops); +	return 0; +} +device_initcall(sched_clock_syscore_init); diff --git a/kernel/time/tick-broadcast-hrtimer.c b/kernel/time/tick-broadcast-hrtimer.c new file mode 100644 index 00000000000..eb682d5c697 --- /dev/null +++ b/kernel/time/tick-broadcast-hrtimer.c @@ -0,0 +1,106 @@ +/* + * linux/kernel/time/tick-broadcast-hrtimer.c + * This file emulates a local clock event device + * via a pseudo clock device. 
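
A platform timer driver consumes this interface by handing sched_clock_register() a read callback together with the counter width and rate. The sketch below uses an invented MMIO counter; the base pointer, the 32-bit width and the 24 MHz rate are assumptions.

#include <linux/init.h>
#include <linux/io.h>
#include <linux/sched_clock.h>

static void __iomem *foo_counter_base;	/* mapped elsewhere in the driver */

static u64 notrace foo_counter_read(void)
{
	return readl_relaxed(foo_counter_base);	/* free-running 32-bit counter */
}

static void __init foo_timer_init(void)
{
	/* 32 valid bits, ticking at 24 MHz; called early with IRQs still off */
	sched_clock_register(foo_counter_read, 32, 24000000);
}

Registration only replaces the jiffies-based fallback (or a previously registered source) if the new counter has a higher rate, as the cd.rate check above shows.
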
+ */ +#include <linux/cpu.h> +#include <linux/err.h> +#include <linux/hrtimer.h> +#include <linux/interrupt.h> +#include <linux/percpu.h> +#include <linux/profile.h> +#include <linux/clockchips.h> +#include <linux/sched.h> +#include <linux/smp.h> +#include <linux/module.h> + +#include "tick-internal.h" + +static struct hrtimer bctimer; + +static void bc_set_mode(enum clock_event_mode mode, +			struct clock_event_device *bc) +{ +	switch (mode) { +	case CLOCK_EVT_MODE_SHUTDOWN: +		/* +		 * Note, we cannot cancel the timer here as we might +		 * run into the following live lock scenario: +		 * +		 * cpu 0		cpu1 +		 * lock(broadcast_lock); +		 *			hrtimer_interrupt() +		 *			bc_handler() +		 *			   tick_handle_oneshot_broadcast(); +		 *			    lock(broadcast_lock); +		 * hrtimer_cancel() +		 *  wait_for_callback() +		 */ +		hrtimer_try_to_cancel(&bctimer); +		break; +	default: +		break; +	} +} + +/* + * This is called from the guts of the broadcast code when the cpu + * which is about to enter idle has the earliest broadcast timer event. + */ +static int bc_set_next(ktime_t expires, struct clock_event_device *bc) +{ +	/* +	 * We try to cancel the timer first. If the callback is on +	 * flight on some other cpu then we let it handle it. If we +	 * were able to cancel the timer nothing can rearm it as we +	 * own broadcast_lock. +	 * +	 * However we can also be called from the event handler of +	 * ce_broadcast_hrtimer itself when it expires. We cannot +	 * restart the timer because we are in the callback, but we +	 * can set the expiry time and let the callback return +	 * HRTIMER_RESTART. +	 */ +	if (hrtimer_try_to_cancel(&bctimer) >= 0) { +		hrtimer_start(&bctimer, expires, HRTIMER_MODE_ABS_PINNED); +		/* Bind the "device" to the cpu */ +		bc->bound_on = smp_processor_id(); +	} else if (bc->bound_on == smp_processor_id()) { +		hrtimer_set_expires(&bctimer, expires); +	} +	return 0; +} + +static struct clock_event_device ce_broadcast_hrtimer = { +	.set_mode		= bc_set_mode, +	.set_next_ktime		= bc_set_next, +	.features		= CLOCK_EVT_FEAT_ONESHOT | +				  CLOCK_EVT_FEAT_KTIME | +				  CLOCK_EVT_FEAT_HRTIMER, +	.rating			= 0, +	.bound_on		= -1, +	.min_delta_ns		= 1, +	.max_delta_ns		= KTIME_MAX, +	.min_delta_ticks	= 1, +	.max_delta_ticks	= ULONG_MAX, +	.mult			= 1, +	.shift			= 0, +	.cpumask		= cpu_all_mask, +}; + +static enum hrtimer_restart bc_handler(struct hrtimer *t) +{ +	ce_broadcast_hrtimer.event_handler(&ce_broadcast_hrtimer); + +	if (ce_broadcast_hrtimer.next_event.tv64 == KTIME_MAX) +		return HRTIMER_NORESTART; + +	return HRTIMER_RESTART; +} + +void tick_setup_hrtimer_broadcast(void) +{ +	hrtimer_init(&bctimer, CLOCK_MONOTONIC, HRTIMER_MODE_ABS); +	bctimer.function = bc_handler; +	clockevents_register_device(&ce_broadcast_hrtimer); +} diff --git a/kernel/time/tick-broadcast.c b/kernel/time/tick-broadcast.c index 48b2761b566..64c5990fd50 100644 --- a/kernel/time/tick-broadcast.c +++ b/kernel/time/tick-broadcast.c @@ -18,7 +18,8 @@  #include <linux/percpu.h>  #include <linux/profile.h>  #include <linux/sched.h> -#include <linux/tick.h> +#include <linux/smp.h> +#include <linux/module.h>  #include "tick-internal.h" @@ -28,9 +29,9 @@   */  static struct tick_device tick_broadcast_device; -/* FIXME: Use cpumask_var_t. 
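
Both bc_handler() above and sched_clock_poll() earlier in this series rely on the same self-rearming hrtimer idiom: the callback does its work, pushes its own expiry forward and returns HRTIMER_RESTART. Shown standalone with made-up names and an assumed 100 ms period:

#include <linux/hrtimer.h>
#include <linux/ktime.h>

static struct hrtimer foo_timer;

static enum hrtimer_restart foo_timer_fn(struct hrtimer *t)
{
	/* ... periodic work ... */
	hrtimer_forward_now(t, ktime_set(0, 100 * NSEC_PER_MSEC));
	return HRTIMER_RESTART;		/* keep the timer running */
}

static void foo_timer_start(void)
{
	hrtimer_init(&foo_timer, CLOCK_MONOTONIC, HRTIMER_MODE_REL);
	foo_timer.function = foo_timer_fn;
	hrtimer_start(&foo_timer, ktime_set(0, 100 * NSEC_PER_MSEC),
		      HRTIMER_MODE_REL);
}
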
*/ -static DECLARE_BITMAP(tick_broadcast_mask, NR_CPUS); -static DECLARE_BITMAP(tmpmask, NR_CPUS); +static cpumask_var_t tick_broadcast_mask; +static cpumask_var_t tick_broadcast_on; +static cpumask_var_t tmpmask;  static DEFINE_RAW_SPINLOCK(tick_broadcast_lock);  static int tick_broadcast_force; @@ -50,7 +51,7 @@ struct tick_device *tick_get_broadcast_device(void)  struct cpumask *tick_get_broadcast_mask(void)  { -	return to_cpumask(tick_broadcast_mask); +	return tick_broadcast_mask;  }  /* @@ -65,18 +66,50 @@ static void tick_broadcast_start_periodic(struct clock_event_device *bc)  /*   * Check, if the device can be utilized as broadcast device:   */ -int tick_check_broadcast_device(struct clock_event_device *dev) +static bool tick_check_broadcast_device(struct clock_event_device *curdev, +					struct clock_event_device *newdev)  { -	if ((tick_broadcast_device.evtdev && -	     tick_broadcast_device.evtdev->rating >= dev->rating) || -	     (dev->features & CLOCK_EVT_FEAT_C3STOP)) -		return 0; +	if ((newdev->features & CLOCK_EVT_FEAT_DUMMY) || +	    (newdev->features & CLOCK_EVT_FEAT_PERCPU) || +	    (newdev->features & CLOCK_EVT_FEAT_C3STOP)) +		return false; + +	if (tick_broadcast_device.mode == TICKDEV_MODE_ONESHOT && +	    !(newdev->features & CLOCK_EVT_FEAT_ONESHOT)) +		return false; + +	return !curdev || newdev->rating > curdev->rating; +} + +/* + * Conditionally install/replace broadcast device + */ +void tick_install_broadcast_device(struct clock_event_device *dev) +{ +	struct clock_event_device *cur = tick_broadcast_device.evtdev; + +	if (!tick_check_broadcast_device(cur, dev)) +		return; -	clockevents_exchange_device(NULL, dev); +	if (!try_module_get(dev->owner)) +		return; + +	clockevents_exchange_device(cur, dev); +	if (cur) +		cur->event_handler = clockevents_handle_noop;  	tick_broadcast_device.evtdev = dev; -	if (!cpumask_empty(tick_get_broadcast_mask())) +	if (!cpumask_empty(tick_broadcast_mask))  		tick_broadcast_start_periodic(dev); -	return 1; +	/* +	 * Inform all cpus about this. We might be in a situation +	 * where we did not switch to oneshot mode because the per cpu +	 * devices are affected by CLOCK_EVT_FEAT_C3STOP and the lack +	 * of a oneshot capable broadcast device. Without that +	 * notification the systems stays stuck in periodic mode +	 * forever. +	 */ +	if (dev->features & CLOCK_EVT_FEAT_ONESHOT) +		tick_clock_notify();  }  /* @@ -87,14 +120,44 @@ int tick_is_broadcast_device(struct clock_event_device *dev)  	return (dev && tick_broadcast_device.evtdev == dev);  } +int tick_broadcast_update_freq(struct clock_event_device *dev, u32 freq) +{ +	int ret = -ENODEV; + +	if (tick_is_broadcast_device(dev)) { +		raw_spin_lock(&tick_broadcast_lock); +		ret = __clockevents_update_freq(dev, freq); +		raw_spin_unlock(&tick_broadcast_lock); +	} +	return ret; +} + + +static void err_broadcast(const struct cpumask *mask) +{ +	pr_crit_once("Failed to broadcast timer tick. Some CPUs may be unresponsive.\n"); +} + +static void tick_device_setup_broadcast_func(struct clock_event_device *dev) +{ +	if (!dev->broadcast) +		dev->broadcast = tick_broadcast; +	if (!dev->broadcast) { +		pr_warn_once("%s depends on broadcast, but no broadcast function available\n", +			     dev->name); +		dev->broadcast = err_broadcast; +	} +} +  /*   * Check, if the device is disfunctional and a place holder, which   * needs to be handled by the broadcast device.   
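
The switch from DECLARE_BITMAP(..., NR_CPUS) to cpumask_var_t is the standard pattern for masks that should not occupy NR_CPUS bits of static storage when CONFIG_CPUMASK_OFFSTACK is set. A generic sketch of the idiom (the patch itself uses zalloc_cpumask_var(..., GFP_NOWAIT) from early init in tick_broadcast_init() further down; the mask name here is invented):

#include <linux/cpumask.h>
#include <linux/gfp.h>

static cpumask_var_t foo_mask;

static int foo_mask_init(void)
{
	/* Allocates off-stack storage with CONFIG_CPUMASK_OFFSTACK, no-op otherwise */
	if (!zalloc_cpumask_var(&foo_mask, GFP_KERNEL))
		return -ENOMEM;

	cpumask_copy(foo_mask, cpu_online_mask);
	return 0;
}

static void foo_mask_exit(void)
{
	free_cpumask_var(foo_mask);
}
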
*/  int tick_device_uses_broadcast(struct clock_event_device *dev, int cpu)  { +	struct clock_event_device *bc = tick_broadcast_device.evtdev;  	unsigned long flags; -	int ret = 0; +	int ret;  	raw_spin_lock_irqsave(&tick_broadcast_lock, flags); @@ -106,26 +169,87 @@ int tick_device_uses_broadcast(struct clock_event_device *dev, int cpu)  	 */  	if (!tick_device_is_functional(dev)) {  		dev->event_handler = tick_handle_periodic; -		cpumask_set_cpu(cpu, tick_get_broadcast_mask()); -		tick_broadcast_start_periodic(tick_broadcast_device.evtdev); +		tick_device_setup_broadcast_func(dev); +		cpumask_set_cpu(cpu, tick_broadcast_mask); +		if (tick_broadcast_device.mode == TICKDEV_MODE_PERIODIC) +			tick_broadcast_start_periodic(bc); +		else +			tick_broadcast_setup_oneshot(bc);  		ret = 1;  	} else {  		/* -		 * When the new device is not affected by the stop -		 * feature and the cpu is marked in the broadcast mask -		 * then clear the broadcast bit. +		 * Clear the broadcast bit for this cpu if the +		 * device is not power state affected.  		 */ -		if (!(dev->features & CLOCK_EVT_FEAT_C3STOP)) { -			int cpu = smp_processor_id(); +		if (!(dev->features & CLOCK_EVT_FEAT_C3STOP)) +			cpumask_clear_cpu(cpu, tick_broadcast_mask); +		else +			tick_device_setup_broadcast_func(dev); -			cpumask_clear_cpu(cpu, tick_get_broadcast_mask()); +		/* +		 * Clear the broadcast bit if the CPU is not in +		 * periodic broadcast on state. +		 */ +		if (!cpumask_test_cpu(cpu, tick_broadcast_on)) +			cpumask_clear_cpu(cpu, tick_broadcast_mask); + +		switch (tick_broadcast_device.mode) { +		case TICKDEV_MODE_ONESHOT: +			/* +			 * If the system is in oneshot mode we can +			 * unconditionally clear the oneshot mask bit, +			 * because the CPU is running and therefore +			 * not in an idle state which causes the power +			 * state affected device to stop. Let the +			 * caller initialize the device. +			 */  			tick_broadcast_clear_oneshot(cpu); +			ret = 0; +			break; + +		case TICKDEV_MODE_PERIODIC: +			/* +			 * If the system is in periodic mode, check +			 * whether the broadcast device can be +			 * switched off now. +			 */ +			if (cpumask_empty(tick_broadcast_mask) && bc) +				clockevents_shutdown(bc); +			/* +			 * If we kept the cpu in the broadcast mask, +			 * tell the caller to leave the per cpu device +			 * in shutdown state. The periodic interrupt +			 * is delivered by the broadcast device. +			 */ +			ret = cpumask_test_cpu(cpu, tick_broadcast_mask); +			break; +		default: +			/* Nothing to do */ +			ret = 0; +			break;  		}  	}  	raw_spin_unlock_irqrestore(&tick_broadcast_lock, flags);  	return ret;  } +#ifdef CONFIG_GENERIC_CLOCKEVENTS_BROADCAST +int tick_receive_broadcast(void) +{ +	struct tick_device *td = this_cpu_ptr(&tick_cpu_device); +	struct clock_event_device *evt = td->evtdev; + +	if (!evt) +		return -ENODEV; + +	if (!evt->event_handler) +		return -EINVAL; + +	evt->event_handler(evt); +	return 0; +} +#endif +  /*   * Broadcast the event to the cpus, which are set in the mask (mangled).   
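
tick_receive_broadcast() is the arch-facing half of this: the architecture's broadcast-IPI handler calls it to run the local tick device's event handler on the woken CPU. Roughly as below; the IPI plumbing and the function name are made up, the real dispatch lives in arch code.

#include <linux/clockchips.h>
#include <linux/printk.h>

/* Invoked from the arch IPI demux when the timer-broadcast IPI arrives. */
static void foo_ipi_timer(void)
{
	if (tick_receive_broadcast())
		pr_warn_once("broadcast IPI but no usable tick device on this CPU\n");
}
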
*/ @@ -161,13 +285,8 @@ static void tick_do_broadcast(struct cpumask *mask)   */  static void tick_do_periodic_broadcast(void)  { -	raw_spin_lock(&tick_broadcast_lock); - -	cpumask_and(to_cpumask(tmpmask), -		    cpu_online_mask, tick_get_broadcast_mask()); -	tick_do_broadcast(to_cpumask(tmpmask)); - -	raw_spin_unlock(&tick_broadcast_lock); +	cpumask_and(tmpmask, cpu_online_mask, tick_broadcast_mask); +	tick_do_broadcast(tmpmask);  }  /* @@ -177,13 +296,15 @@ static void tick_handle_periodic_broadcast(struct clock_event_device *dev)  {  	ktime_t next; +	raw_spin_lock(&tick_broadcast_lock); +  	tick_do_periodic_broadcast();  	/*  	 * The device is in periodic mode. No reprogramming necessary:  	 */  	if (dev->mode == CLOCK_EVT_MODE_PERIODIC) -		return; +		goto unlock;  	/*  	 * Setup the next period for devices, which do not have @@ -195,10 +316,12 @@ static void tick_handle_periodic_broadcast(struct clock_event_device *dev)  	for (next = dev->next_event; ;) {  		next = ktime_add(next, tick_period); -		if (!clockevents_program_event(dev, next, ktime_get())) -			return; +		if (!clockevents_program_event(dev, next, false)) +			goto unlock;  		tick_do_periodic_broadcast();  	} +unlock: +	raw_spin_unlock(&tick_broadcast_lock);  }  /* @@ -228,13 +351,13 @@ static void tick_do_broadcast_on_off(unsigned long *reason)  	if (!tick_device_is_functional(dev))  		goto out; -	bc_stopped = cpumask_empty(tick_get_broadcast_mask()); +	bc_stopped = cpumask_empty(tick_broadcast_mask);  	switch (*reason) {  	case CLOCK_EVT_NOTIFY_BROADCAST_ON:  	case CLOCK_EVT_NOTIFY_BROADCAST_FORCE: -		if (!cpumask_test_cpu(cpu, tick_get_broadcast_mask())) { -			cpumask_set_cpu(cpu, tick_get_broadcast_mask()); +		cpumask_set_cpu(cpu, tick_broadcast_on); +		if (!cpumask_test_and_set_cpu(cpu, tick_broadcast_mask)) {  			if (tick_broadcast_device.mode ==  			    TICKDEV_MODE_PERIODIC)  				clockevents_shutdown(dev); @@ -243,9 +366,12 @@ static void tick_do_broadcast_on_off(unsigned long *reason)  			tick_broadcast_force = 1;  		break;  	case CLOCK_EVT_NOTIFY_BROADCAST_OFF: -		if (!tick_broadcast_force && -		    cpumask_test_cpu(cpu, tick_get_broadcast_mask())) { -			cpumask_clear_cpu(cpu, tick_get_broadcast_mask()); +		if (tick_broadcast_force) +			break; +		cpumask_clear_cpu(cpu, tick_broadcast_on); +		if (!tick_device_is_functional(dev)) +			break; +		if (cpumask_test_and_clear_cpu(cpu, tick_broadcast_mask)) {  			if (tick_broadcast_device.mode ==  			    TICKDEV_MODE_PERIODIC)  				tick_setup_periodic(dev, 0); @@ -253,7 +379,7 @@ static void tick_do_broadcast_on_off(unsigned long *reason)  		break;  	} -	if (cpumask_empty(tick_get_broadcast_mask())) { +	if (cpumask_empty(tick_broadcast_mask)) {  		if (!bc_stopped)  			clockevents_shutdown(bc);  	} else if (bc_stopped) { @@ -302,10 +428,11 @@ void tick_shutdown_broadcast(unsigned int *cpup)  	raw_spin_lock_irqsave(&tick_broadcast_lock, flags);  	bc = tick_broadcast_device.evtdev; -	cpumask_clear_cpu(cpu, tick_get_broadcast_mask()); +	cpumask_clear_cpu(cpu, tick_broadcast_mask); +	cpumask_clear_cpu(cpu, tick_broadcast_on);  	if (tick_broadcast_device.mode == TICKDEV_MODE_PERIODIC) { -		if (bc && cpumask_empty(tick_get_broadcast_mask())) +		if (bc && cpumask_empty(tick_broadcast_mask))  			clockevents_shutdown(bc);  	} @@ -341,13 +468,14 @@ int tick_resume_broadcast(void)  		switch (tick_broadcast_device.mode) {  		case TICKDEV_MODE_PERIODIC: -			if (!cpumask_empty(tick_get_broadcast_mask())) +			if (!cpumask_empty(tick_broadcast_mask))  				tick_broadcast_start_periodic(bc);  	
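
Note the call convention threaded through this hunk: clockevents_program_event() now takes a bool force flag instead of the current time, and returns non-zero when the requested expiry already lies in the past. The retry loops here and in tick_handle_periodic() therefore keep stepping the expiry forward by one period until programming succeeds (the real handlers also run the periodic work for every period they skip). Distilled into a standalone sketch:

#include <linux/clockchips.h>
#include <linux/ktime.h>

/* Advance 'next' by one period until the device accepts a future expiry. */
static void foo_program_next(struct clock_event_device *dev,
			     ktime_t next, ktime_t period)
{
	for (;;) {
		next = ktime_add(next, period);
		if (!clockevents_program_event(dev, next, false))
			return;
	}
}
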
		broadcast = cpumask_test_cpu(smp_processor_id(), -						     tick_get_broadcast_mask()); +						     tick_broadcast_mask);  			break;  		case TICKDEV_MODE_ONESHOT: -			broadcast = tick_resume_broadcast_oneshot(bc); +			if (!cpumask_empty(tick_broadcast_mask)) +				broadcast = tick_resume_broadcast_oneshot(bc);  			break;  		}  	} @@ -359,22 +487,58 @@ int tick_resume_broadcast(void)  #ifdef CONFIG_TICK_ONESHOT -/* FIXME: use cpumask_var_t. */ -static DECLARE_BITMAP(tick_broadcast_oneshot_mask, NR_CPUS); +static cpumask_var_t tick_broadcast_oneshot_mask; +static cpumask_var_t tick_broadcast_pending_mask; +static cpumask_var_t tick_broadcast_force_mask;  /*   * Exposed for debugging: see timer_list.c   */  struct cpumask *tick_get_broadcast_oneshot_mask(void)  { -	return to_cpumask(tick_broadcast_oneshot_mask); +	return tick_broadcast_oneshot_mask;  } -static int tick_broadcast_set_event(ktime_t expires, int force) +/* + * Called before going idle with interrupts disabled. Checks whether a + * broadcast event from the other core is about to happen. We detected + * that in tick_broadcast_oneshot_control(). The callsite can use this + * to avoid a deep idle transition as we are about to get the + * broadcast IPI right away. + */ +int tick_check_broadcast_expired(void)  { -	struct clock_event_device *bc = tick_broadcast_device.evtdev; +	return cpumask_test_cpu(smp_processor_id(), tick_broadcast_force_mask); +} + +/* + * Set broadcast interrupt affinity + */ +static void tick_broadcast_set_affinity(struct clock_event_device *bc, +					const struct cpumask *cpumask) +{ +	if (!(bc->features & CLOCK_EVT_FEAT_DYNIRQ)) +		return; + +	if (cpumask_equal(bc->cpumask, cpumask)) +		return; + +	bc->cpumask = cpumask; +	irq_set_affinity(bc->irq, bc->cpumask); +} + +static int tick_broadcast_set_event(struct clock_event_device *bc, int cpu, +				    ktime_t expires, int force) +{ +	int ret; + +	if (bc->mode != CLOCK_EVT_MODE_ONESHOT) +		clockevents_set_mode(bc, CLOCK_EVT_MODE_ONESHOT); -	return tick_dev_program_event(bc, expires, force); +	ret = clockevents_program_event(bc, expires, force); +	if (!ret) +		tick_broadcast_set_affinity(bc, cpumask_of(cpu)); +	return ret;  }  int tick_resume_broadcast_oneshot(struct clock_event_device *bc) @@ -387,12 +551,20 @@ int tick_resume_broadcast_oneshot(struct clock_event_device *bc)   * Called from irq_enter() when idle was interrupted to reenable the   * per cpu device.   */ -void tick_check_oneshot_broadcast(int cpu) +void tick_check_oneshot_broadcast_this_cpu(void)  { -	if (cpumask_test_cpu(cpu, to_cpumask(tick_broadcast_oneshot_mask))) { -		struct tick_device *td = &per_cpu(tick_cpu_device, cpu); +	if (cpumask_test_cpu(smp_processor_id(), tick_broadcast_oneshot_mask)) { +		struct tick_device *td = &__get_cpu_var(tick_cpu_device); -		clockevents_set_mode(td->evtdev, CLOCK_EVT_MODE_ONESHOT); +		/* +		 * We might be in the middle of switching over from +		 * periodic to oneshot. If the CPU has not yet +		 * switched over, leave the device alone. 
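
tick_check_broadcast_expired() exists so the idle entry path can notice that this CPU is already marked in the force mask and is about to receive the broadcast IPI; entering a deep, timer-stopping state at that point would be wasted work. A much simplified sketch of how an idle loop can use it; the surrounding loop and the poll fallback are illustrative, not the kernel's actual idle code.

#include <linux/cpu.h>
#include <linux/sched.h>
#include <linux/tick.h>

static void foo_idle_once(void)
{
	if (tick_check_broadcast_expired()) {
		/* Broadcast wakeup is imminent: just poll briefly */
		while (!need_resched())
			cpu_relax();
	} else {
		arch_cpu_idle();	/* may enter a state that stops the local timer */
	}
}
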
+		 */ +		if (td->mode == TICKDEV_MODE_ONESHOT) { +			clockevents_set_mode(td->evtdev, +					     CLOCK_EVT_MODE_ONESHOT); +		}  	}  } @@ -403,27 +575,52 @@ static void tick_handle_oneshot_broadcast(struct clock_event_device *dev)  {  	struct tick_device *td;  	ktime_t now, next_event; -	int cpu; +	int cpu, next_cpu = 0;  	raw_spin_lock(&tick_broadcast_lock);  again:  	dev->next_event.tv64 = KTIME_MAX;  	next_event.tv64 = KTIME_MAX; -	cpumask_clear(to_cpumask(tmpmask)); +	cpumask_clear(tmpmask);  	now = ktime_get();  	/* Find all expired events */ -	for_each_cpu(cpu, tick_get_broadcast_oneshot_mask()) { +	for_each_cpu(cpu, tick_broadcast_oneshot_mask) {  		td = &per_cpu(tick_cpu_device, cpu); -		if (td->evtdev->next_event.tv64 <= now.tv64) -			cpumask_set_cpu(cpu, to_cpumask(tmpmask)); -		else if (td->evtdev->next_event.tv64 < next_event.tv64) +		if (td->evtdev->next_event.tv64 <= now.tv64) { +			cpumask_set_cpu(cpu, tmpmask); +			/* +			 * Mark the remote cpu in the pending mask, so +			 * it can avoid reprogramming the cpu local +			 * timer in tick_broadcast_oneshot_control(). +			 */ +			cpumask_set_cpu(cpu, tick_broadcast_pending_mask); +		} else if (td->evtdev->next_event.tv64 < next_event.tv64) {  			next_event.tv64 = td->evtdev->next_event.tv64; +			next_cpu = cpu; +		}  	}  	/* +	 * Remove the current cpu from the pending mask. The event is +	 * delivered immediately in tick_do_broadcast() ! +	 */ +	cpumask_clear_cpu(smp_processor_id(), tick_broadcast_pending_mask); + +	/* Take care of enforced broadcast requests */ +	cpumask_or(tmpmask, tmpmask, tick_broadcast_force_mask); +	cpumask_clear(tick_broadcast_force_mask); + +	/* +	 * Sanity check. Catch the case where we try to broadcast to +	 * offline cpus. +	 */ +	if (WARN_ON_ONCE(!cpumask_subset(tmpmask, cpu_online_mask))) +		cpumask_and(tmpmask, tmpmask, cpu_online_mask); + +	/*  	 * Wakeup the cpus which have an expired event.  	 */ -	tick_do_broadcast(to_cpumask(tmpmask)); +	tick_do_broadcast(tmpmask);  	/*  	 * Two reasons for reprogram: @@ -440,59 +637,176 @@ again:  		 * Rearm the broadcast device. If event expired,  		 * repeat the above  		 */ -		if (tick_broadcast_set_event(next_event, 0)) +		if (tick_broadcast_set_event(dev, next_cpu, next_event, 0))  			goto again;  	}  	raw_spin_unlock(&tick_broadcast_lock);  } +static int broadcast_needs_cpu(struct clock_event_device *bc, int cpu) +{ +	if (!(bc->features & CLOCK_EVT_FEAT_HRTIMER)) +		return 0; +	if (bc->next_event.tv64 == KTIME_MAX) +		return 0; +	return bc->bound_on == cpu ? -EBUSY : 0; +} + +static void broadcast_shutdown_local(struct clock_event_device *bc, +				     struct clock_event_device *dev) +{ +	/* +	 * For hrtimer based broadcasting we cannot shutdown the cpu +	 * local device if our own event is the first one to expire or +	 * if we own the broadcast timer. +	 */ +	if (bc->features & CLOCK_EVT_FEAT_HRTIMER) { +		if (broadcast_needs_cpu(bc, smp_processor_id())) +			return; +		if (dev->next_event.tv64 < bc->next_event.tv64) +			return; +	} +	clockevents_set_mode(dev, CLOCK_EVT_MODE_SHUTDOWN); +} + +static void broadcast_move_bc(int deadcpu) +{ +	struct clock_event_device *bc = tick_broadcast_device.evtdev; + +	if (!bc || !broadcast_needs_cpu(bc, deadcpu)) +		return; +	/* This moves the broadcast assignment to this cpu */ +	clockevents_program_event(bc, bc->next_event, 1); +} +  /*   * Powerstate information: The system enters/leaves a state, where   * affected devices might stop + * Returns 0 on success, -EBUSY if the cpu is used to broadcast wakeups.   
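
The -EBUSY return documented above is consumed by idle drivers: a CPU that currently carries the hrtimer-based broadcast must not enter a state that stops its local timer. With this series clockevents_notify() propagates that return value, so a cpuidle-style caller brackets the deep state roughly as below; the function name and the low-power entry are assumptions.

#include <linux/clockchips.h>

static int foo_enter_deep_idle(int cpu)
{
	if (clockevents_notify(CLOCK_EVT_NOTIFY_BROADCAST_ENTER, &cpu))
		return -EBUSY;	/* we own the broadcast timer: pick a shallower state */

	/* ... platform-specific entry into the timer-stopping idle state ... */

	clockevents_notify(CLOCK_EVT_NOTIFY_BROADCAST_EXIT, &cpu);
	return 0;
}
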
*/ -void tick_broadcast_oneshot_control(unsigned long reason) +int tick_broadcast_oneshot_control(unsigned long reason)  {  	struct clock_event_device *bc, *dev;  	struct tick_device *td;  	unsigned long flags; -	int cpu; - -	raw_spin_lock_irqsave(&tick_broadcast_lock, flags); +	ktime_t now; +	int cpu, ret = 0;  	/*  	 * Periodic mode does not care about the enter/exit of power  	 * states  	 */  	if (tick_broadcast_device.mode == TICKDEV_MODE_PERIODIC) -		goto out; +		return 0; -	bc = tick_broadcast_device.evtdev; +	/* +	 * We are called with preemtion disabled from the depth of the +	 * idle code, so we can't be moved away. +	 */  	cpu = smp_processor_id();  	td = &per_cpu(tick_cpu_device, cpu);  	dev = td->evtdev;  	if (!(dev->features & CLOCK_EVT_FEAT_C3STOP)) -		goto out; +		return 0; +	bc = tick_broadcast_device.evtdev; + +	raw_spin_lock_irqsave(&tick_broadcast_lock, flags);  	if (reason == CLOCK_EVT_NOTIFY_BROADCAST_ENTER) { -		if (!cpumask_test_cpu(cpu, tick_get_broadcast_oneshot_mask())) { -			cpumask_set_cpu(cpu, tick_get_broadcast_oneshot_mask()); -			clockevents_set_mode(dev, CLOCK_EVT_MODE_SHUTDOWN); -			if (dev->next_event.tv64 < bc->next_event.tv64) -				tick_broadcast_set_event(dev->next_event, 1); +		if (!cpumask_test_and_set_cpu(cpu, tick_broadcast_oneshot_mask)) { +			WARN_ON_ONCE(cpumask_test_cpu(cpu, tick_broadcast_pending_mask)); +			broadcast_shutdown_local(bc, dev); +			/* +			 * We only reprogram the broadcast timer if we +			 * did not mark ourself in the force mask and +			 * if the cpu local event is earlier than the +			 * broadcast event. If the current CPU is in +			 * the force mask, then we are going to be +			 * woken by the IPI right away. +			 */ +			if (!cpumask_test_cpu(cpu, tick_broadcast_force_mask) && +			    dev->next_event.tv64 < bc->next_event.tv64) +				tick_broadcast_set_event(bc, cpu, dev->next_event, 1);  		} +		/* +		 * If the current CPU owns the hrtimer broadcast +		 * mechanism, it cannot go deep idle and we remove the +		 * CPU from the broadcast mask. We don't have to go +		 * through the EXIT path as the local timer is not +		 * shutdown. +		 */ +		ret = broadcast_needs_cpu(bc, cpu); +		if (ret) +			cpumask_clear_cpu(cpu, tick_broadcast_oneshot_mask);  	} else { -		if (cpumask_test_cpu(cpu, tick_get_broadcast_oneshot_mask())) { -			cpumask_clear_cpu(cpu, -					  tick_get_broadcast_oneshot_mask()); +		if (cpumask_test_and_clear_cpu(cpu, tick_broadcast_oneshot_mask)) {  			clockevents_set_mode(dev, CLOCK_EVT_MODE_ONESHOT); -			if (dev->next_event.tv64 != KTIME_MAX) -				tick_program_event(dev->next_event, 1); +			/* +			 * The cpu which was handling the broadcast +			 * timer marked this cpu in the broadcast +			 * pending mask and fired the broadcast +			 * IPI. So we are going to handle the expired +			 * event anyway via the broadcast IPI +			 * handler. No need to reprogram the timer +			 * with an already expired event. +			 */ +			if (cpumask_test_and_clear_cpu(cpu, +				       tick_broadcast_pending_mask)) +				goto out; + +			/* +			 * Bail out if there is no next event. +			 */ +			if (dev->next_event.tv64 == KTIME_MAX) +				goto out; +			/* +			 * If the pending bit is not set, then we are +			 * either the CPU handling the broadcast +			 * interrupt or we got woken by something else. +			 * +			 * We are not longer in the broadcast mask, so +			 * if the cpu local expiry time is already +			 * reached, we would reprogram the cpu local +			 * timer with an already expired event. 
+			 * +			 * This can lead to a ping-pong when we return +			 * to idle and therefor rearm the broadcast +			 * timer before the cpu local timer was able +			 * to fire. This happens because the forced +			 * reprogramming makes sure that the event +			 * will happen in the future and depending on +			 * the min_delta setting this might be far +			 * enough out that the ping-pong starts. +			 * +			 * If the cpu local next_event has expired +			 * then we know that the broadcast timer +			 * next_event has expired as well and +			 * broadcast is about to be handled. So we +			 * avoid reprogramming and enforce that the +			 * broadcast handler, which did not run yet, +			 * will invoke the cpu local handler. +			 * +			 * We cannot call the handler directly from +			 * here, because we might be in a NOHZ phase +			 * and we did not go through the irq_enter() +			 * nohz fixups. +			 */ +			now = ktime_get(); +			if (dev->next_event.tv64 <= now.tv64) { +				cpumask_set_cpu(cpu, tick_broadcast_force_mask); +				goto out; +			} +			/* +			 * We got woken by something else. Reprogram +			 * the cpu local timer device. +			 */ +			tick_program_event(dev->next_event, 1);  		}  	} -  out:  	raw_spin_unlock_irqrestore(&tick_broadcast_lock, flags); +	return ret;  }  /* @@ -502,7 +816,8 @@ out:   */  static void tick_broadcast_clear_oneshot(int cpu)  { -	cpumask_clear_cpu(cpu, tick_get_broadcast_oneshot_mask()); +	cpumask_clear_cpu(cpu, tick_broadcast_oneshot_mask); +	cpumask_clear_cpu(cpu, tick_broadcast_pending_mask);  }  static void tick_broadcast_init_next_event(struct cpumask *mask, @@ -523,16 +838,13 @@ static void tick_broadcast_init_next_event(struct cpumask *mask,   */  void tick_broadcast_setup_oneshot(struct clock_event_device *bc)  { +	int cpu = smp_processor_id(); +  	/* Set it up only once ! */  	if (bc->event_handler != tick_handle_oneshot_broadcast) {  		int was_periodic = bc->mode == CLOCK_EVT_MODE_PERIODIC; -		int cpu = smp_processor_id();  		bc->event_handler = tick_handle_oneshot_broadcast; -		clockevents_set_mode(bc, CLOCK_EVT_MODE_ONESHOT); - -		/* Take the do_timer update */ -		tick_do_timer_cpu = cpu;  		/*  		 * We must be careful here. There might be other CPUs @@ -540,18 +852,27 @@ void tick_broadcast_setup_oneshot(struct clock_event_device *bc)  		 * oneshot_mask bits for those and program the  		 * broadcast device to fire.  		 */ -		cpumask_copy(to_cpumask(tmpmask), tick_get_broadcast_mask()); -		cpumask_clear_cpu(cpu, to_cpumask(tmpmask)); -		cpumask_or(tick_get_broadcast_oneshot_mask(), -			   tick_get_broadcast_oneshot_mask(), -			   to_cpumask(tmpmask)); - -		if (was_periodic && !cpumask_empty(to_cpumask(tmpmask))) { -			tick_broadcast_init_next_event(to_cpumask(tmpmask), +		cpumask_copy(tmpmask, tick_broadcast_mask); +		cpumask_clear_cpu(cpu, tmpmask); +		cpumask_or(tick_broadcast_oneshot_mask, +			   tick_broadcast_oneshot_mask, tmpmask); + +		if (was_periodic && !cpumask_empty(tmpmask)) { +			clockevents_set_mode(bc, CLOCK_EVT_MODE_ONESHOT); +			tick_broadcast_init_next_event(tmpmask,  						       tick_next_period); -			tick_broadcast_set_event(tick_next_period, 1); +			tick_broadcast_set_event(bc, cpu, tick_next_period, 1);  		} else  			bc->next_event.tv64 = KTIME_MAX; +	} else { +		/* +		 * The first cpu which switches to oneshot mode sets +		 * the bit for all other cpus which are in the general +		 * (periodic) broadcast mask. So the bit is set and +		 * would prevent the first broadcast enter after this +		 * to program the bc device. 
+		 */ +		tick_broadcast_clear_oneshot(cpu);  	}  } @@ -569,6 +890,7 @@ void tick_broadcast_switch_to_oneshot(void)  	bc = tick_broadcast_device.evtdev;  	if (bc)  		tick_broadcast_setup_oneshot(bc); +  	raw_spin_unlock_irqrestore(&tick_broadcast_lock, flags);  } @@ -584,10 +906,14 @@ void tick_shutdown_broadcast_oneshot(unsigned int *cpup)  	raw_spin_lock_irqsave(&tick_broadcast_lock, flags);  	/* -	 * Clear the broadcast mask flag for the dead cpu, but do not -	 * stop the broadcast device! +	 * Clear the broadcast masks for the dead cpu, but do not stop +	 * the broadcast device!  	 */ -	cpumask_clear_cpu(cpu, tick_get_broadcast_oneshot_mask()); +	cpumask_clear_cpu(cpu, tick_broadcast_oneshot_mask); +	cpumask_clear_cpu(cpu, tick_broadcast_pending_mask); +	cpumask_clear_cpu(cpu, tick_broadcast_force_mask); + +	broadcast_move_bc(cpu);  	raw_spin_unlock_irqrestore(&tick_broadcast_lock, flags);  } @@ -600,4 +926,26 @@ int tick_broadcast_oneshot_active(void)  	return tick_broadcast_device.mode == TICKDEV_MODE_ONESHOT;  } +/* + * Check whether the broadcast device supports oneshot. + */ +bool tick_broadcast_oneshot_available(void) +{ +	struct clock_event_device *bc = tick_broadcast_device.evtdev; + +	return bc ? bc->features & CLOCK_EVT_FEAT_ONESHOT : false; +} +  #endif + +void __init tick_broadcast_init(void) +{ +	zalloc_cpumask_var(&tick_broadcast_mask, GFP_NOWAIT); +	zalloc_cpumask_var(&tick_broadcast_on, GFP_NOWAIT); +	zalloc_cpumask_var(&tmpmask, GFP_NOWAIT); +#ifdef CONFIG_TICK_ONESHOT +	zalloc_cpumask_var(&tick_broadcast_oneshot_mask, GFP_NOWAIT); +	zalloc_cpumask_var(&tick_broadcast_pending_mask, GFP_NOWAIT); +	zalloc_cpumask_var(&tick_broadcast_force_mask, GFP_NOWAIT); +#endif +} diff --git a/kernel/time/tick-common.c b/kernel/time/tick-common.c index b6b898d2eee..0a0608edeb2 100644 --- a/kernel/time/tick-common.c +++ b/kernel/time/tick-common.c @@ -18,7 +18,7 @@  #include <linux/percpu.h>  #include <linux/profile.h>  #include <linux/sched.h> -#include <linux/tick.h> +#include <linux/module.h>  #include <asm/irq_regs.h> @@ -33,8 +33,22 @@ DEFINE_PER_CPU(struct tick_device, tick_cpu_device);   */  ktime_t tick_next_period;  ktime_t tick_period; + +/* + * tick_do_timer_cpu is a timer core internal variable which holds the CPU NR + * which is responsible for calling do_timer(), i.e. the timekeeping stuff. This + * variable has two functions: + * + * 1) Prevent a thundering herd issue of a gazillion of CPUs trying to grab the + *    timekeeping lock all at once. Only the CPU which is assigned to do the + *    update is handling it. + * + * 2) Hand off the duty in the NOHZ idle case by setting the value to + *    TICK_DO_TIMER_NONE, i.e. a non existing CPU. So the next cpu which looks + *    at it will take over and keep the time keeping alive.  The handover + *    procedure also covers cpu hotplug. 
+ */  int tick_do_timer_cpu __read_mostly = TICK_DO_TIMER_BOOT; -static DEFINE_RAW_SPINLOCK(tick_device_lock);  /*   * Debugging: see timer_list.c @@ -49,9 +63,13 @@ struct tick_device *tick_get_device(int cpu)   */  int tick_is_oneshot_available(void)  { -	struct clock_event_device *dev = __get_cpu_var(tick_cpu_device).evtdev; +	struct clock_event_device *dev = __this_cpu_read(tick_cpu_device.evtdev); -	return dev && (dev->features & CLOCK_EVT_FEAT_ONESHOT); +	if (!dev || !(dev->features & CLOCK_EVT_FEAT_ONESHOT)) +		return 0; +	if (!(dev->features & CLOCK_EVT_FEAT_C3STOP)) +		return 1; +	return tick_broadcast_oneshot_available();  }  /* @@ -60,13 +78,14 @@ int tick_is_oneshot_available(void)  static void tick_periodic(int cpu)  {  	if (tick_do_timer_cpu == cpu) { -		write_seqlock(&xtime_lock); +		write_seqlock(&jiffies_lock);  		/* Keep track of the next tick event */  		tick_next_period = ktime_add(tick_next_period, tick_period);  		do_timer(1); -		write_sequnlock(&xtime_lock); +		write_sequnlock(&jiffies_lock); +		update_wall_time();  	}  	update_process_times(user_mode(get_irq_regs())); @@ -79,19 +98,20 @@ static void tick_periodic(int cpu)  void tick_handle_periodic(struct clock_event_device *dev)  {  	int cpu = smp_processor_id(); -	ktime_t next; +	ktime_t next = dev->next_event;  	tick_periodic(cpu);  	if (dev->mode != CLOCK_EVT_MODE_ONESHOT)  		return; -	/* -	 * Setup the next period for devices, which do not have -	 * periodic mode: -	 */ -	next = ktime_add(dev->next_event, tick_period);  	for (;;) { -		if (!clockevents_program_event(dev, next, ktime_get())) +		/* +		 * Setup the next period for devices, which do not have +		 * periodic mode: +		 */ +		next = ktime_add(next, tick_period); + +		if (!clockevents_program_event(dev, next, false))  			return;  		/*  		 * Have to be careful here. If we're in oneshot mode, @@ -99,12 +119,11 @@ void tick_handle_periodic(struct clock_event_device *dev)  		 * to be sure we're using a real hardware clocksource.  		 * Otherwise we could get trapped in an infinite  		 * loop, as the tick_periodic() increments jiffies, -		 * when then will increment time, posibly causing +		 * which then will increment time, possibly causing  		 * the loop to trigger again and again.  		 */  		if (timekeeping_valid_for_hres())  			tick_periodic(cpu); -		next = ktime_add(next, tick_period);  	}  } @@ -127,14 +146,14 @@ void tick_setup_periodic(struct clock_event_device *dev, int broadcast)  		ktime_t next;  		do { -			seq = read_seqbegin(&xtime_lock); +			seq = read_seqbegin(&jiffies_lock);  			next = tick_next_period; -		} while (read_seqretry(&xtime_lock, seq)); +		} while (read_seqretry(&jiffies_lock, seq));  		clockevents_set_mode(dev, CLOCK_EVT_MODE_ONESHOT);  		for (;;) { -			if (!clockevents_program_event(dev, next, ktime_get())) +			if (!clockevents_program_event(dev, next, false))  				return;  			next = ktime_add(next, tick_period);  		} @@ -160,7 +179,10 @@ static void tick_setup_device(struct tick_device *td,  		 * this cpu:  		 */  		if (tick_do_timer_cpu == TICK_DO_TIMER_BOOT) { -			tick_do_timer_cpu = cpu; +			if (!tick_nohz_full_cpu(cpu)) +				tick_do_timer_cpu = cpu; +			else +				tick_do_timer_cpu = TICK_DO_TIMER_NONE;  			tick_next_period = ktime_get();  			tick_period = ktime_set(0, NSEC_PER_SEC / HZ);  		} @@ -188,7 +210,8 @@ static void tick_setup_device(struct tick_device *td,  	 * When global broadcasting is active, check if the current  	 * device is registered as a placeholder for broadcast mode.  	 
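
With xtime_lock gone, tick_next_period and the jiffies update are serialized by jiffies_lock, a plain seqlock: tick_periodic() writes under write_seqlock() while readers use the retry loop seen in tick_setup_periodic(). The reader side, shown standalone; the extern declarations mirror what tick-internal.h now provides to built-in code.

#include <linux/ktime.h>
#include <linux/seqlock.h>

extern seqlock_t jiffies_lock;		/* per this patch, declared in tick-internal.h */
extern ktime_t tick_next_period;

static ktime_t foo_read_next_period(void)
{
	ktime_t next;
	unsigned int seq;

	do {
		seq = read_seqbegin(&jiffies_lock);
		next = tick_next_period;
	} while (read_seqretry(&jiffies_lock, seq));

	return next;
}
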
* This allows us to handle this x86 misfeature in a generic -	 * way. +	 * way. This function also returns !=0 when we keep the +	 * current active broadcast state for this CPU.  	 */  	if (tick_device_uses_broadcast(newdev, cpu))  		return; @@ -199,17 +222,75 @@ static void tick_setup_device(struct tick_device *td,  		tick_setup_oneshot(newdev, handler, next_event);  } +void tick_install_replacement(struct clock_event_device *newdev) +{ +	struct tick_device *td = &__get_cpu_var(tick_cpu_device); +	int cpu = smp_processor_id(); + +	clockevents_exchange_device(td->evtdev, newdev); +	tick_setup_device(td, newdev, cpu, cpumask_of(cpu)); +	if (newdev->features & CLOCK_EVT_FEAT_ONESHOT) +		tick_oneshot_notify(); +} + +static bool tick_check_percpu(struct clock_event_device *curdev, +			      struct clock_event_device *newdev, int cpu) +{ +	if (!cpumask_test_cpu(cpu, newdev->cpumask)) +		return false; +	if (cpumask_equal(newdev->cpumask, cpumask_of(cpu))) +		return true; +	/* Check if irq affinity can be set */ +	if (newdev->irq >= 0 && !irq_can_set_affinity(newdev->irq)) +		return false; +	/* Prefer an existing cpu local device */ +	if (curdev && cpumask_equal(curdev->cpumask, cpumask_of(cpu))) +		return false; +	return true; +} + +static bool tick_check_preferred(struct clock_event_device *curdev, +				 struct clock_event_device *newdev) +{ +	/* Prefer oneshot capable device */ +	if (!(newdev->features & CLOCK_EVT_FEAT_ONESHOT)) { +		if (curdev && (curdev->features & CLOCK_EVT_FEAT_ONESHOT)) +			return false; +		if (tick_oneshot_mode_active()) +			return false; +	} + +	/* +	 * Use the higher rated one, but prefer a CPU local device with a lower +	 * rating than a non-CPU local device +	 */ +	return !curdev || +		newdev->rating > curdev->rating || +	       !cpumask_equal(curdev->cpumask, newdev->cpumask); +} +  /* - * Check, if the new registered device should be used. + * Check whether the new device is a better fit than curdev. curdev + * can be NULL !   */ -static int tick_check_new_device(struct clock_event_device *newdev) +bool tick_check_replacement(struct clock_event_device *curdev, +			    struct clock_event_device *newdev) +{ +	if (!tick_check_percpu(curdev, newdev, smp_processor_id())) +		return false; + +	return tick_check_preferred(curdev, newdev); +} + +/* + * Check, if the new registered device should be used. Called with + * clockevents_lock held and interrupts disabled. + */ +void tick_check_new_device(struct clock_event_device *newdev)  {  	struct clock_event_device *curdev;  	struct tick_device *td; -	int cpu, ret = NOTIFY_OK; -	unsigned long flags; - -	raw_spin_lock_irqsave(&tick_device_lock, flags); +	int cpu;  	cpu = smp_processor_id();  	if (!cpumask_test_cpu(cpu, newdev->cpumask)) @@ -219,40 +300,15 @@ static int tick_check_new_device(struct clock_event_device *newdev)  	curdev = td->evtdev;  	/* cpu local device ? */ -	if (!cpumask_equal(newdev->cpumask, cpumask_of(cpu))) { - -		/* -		 * If the cpu affinity of the device interrupt can not -		 * be set, ignore it. -		 */ -		if (!irq_can_set_affinity(newdev->irq)) -			goto out_bc; +	if (!tick_check_percpu(curdev, newdev, cpu)) +		goto out_bc; -		/* -		 * If we have a cpu local device already, do not replace it -		 * by a non cpu local device -		 */ -		if (curdev && cpumask_equal(curdev->cpumask, cpumask_of(cpu))) -			goto out_bc; -	} +	/* Preference decision */ +	if (!tick_check_preferred(curdev, newdev)) +		goto out_bc; -	/* -	 * If we have an active device, then check the rating and the oneshot -	 * feature. 
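
The tick_check_percpu()/tick_check_preferred() pair runs whenever a clockevent driver registers a device. For reference, a per-cpu timer driver feeds this path roughly as follows; everything here (names, rating, 24 MHz, the delta limits) is invented, and interrupt setup plus the actual hardware programming are omitted.

#include <linux/clockchips.h>
#include <linux/cpumask.h>
#include <linux/percpu.h>

static void foo_ce_set_mode(enum clock_event_mode mode,
			    struct clock_event_device *ce)
{
	/* ... switch the hardware between periodic/oneshot/shutdown ... */
}

static int foo_ce_set_next_event(unsigned long ticks,
				 struct clock_event_device *ce)
{
	/* ... arm the comparator 'ticks' counter ticks from now ... */
	return 0;
}

static DEFINE_PER_CPU(struct clock_event_device, foo_clockevent) = {
	.name		= "foo-timer",
	.features	= CLOCK_EVT_FEAT_PERIODIC | CLOCK_EVT_FEAT_ONESHOT,
	.rating		= 300,
	.set_mode	= foo_ce_set_mode,
	.set_next_event	= foo_ce_set_next_event,
};

static void foo_clockevent_setup(int cpu)
{
	struct clock_event_device *ce = &per_cpu(foo_clockevent, cpu);

	ce->cpumask = cpumask_of(cpu);
	/* registration ends up in tick_check_new_device() above */
	clockevents_config_and_register(ce, 24000000, 0xf, 0x7fffffff);
}

If the new device loses out here, for instance to a higher-rated CPU-local timer, it is offered to tick_install_broadcast_device() via the out_bc path instead.
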
-	 */ -	if (curdev) { -		/* -		 * Prefer one shot capable devices ! -		 */ -		if ((curdev->features & CLOCK_EVT_FEAT_ONESHOT) && -		    !(newdev->features & CLOCK_EVT_FEAT_ONESHOT)) -			goto out_bc; -		/* -		 * Check the rating -		 */ -		if (curdev->rating >= newdev->rating) -			goto out_bc; -	} +	if (!try_module_get(newdev->owner)) +		return;  	/*  	 * Replace the eventually existing device by the new @@ -267,20 +323,13 @@ static int tick_check_new_device(struct clock_event_device *newdev)  	tick_setup_device(td, newdev, cpu, cpumask_of(cpu));  	if (newdev->features & CLOCK_EVT_FEAT_ONESHOT)  		tick_oneshot_notify(); - -	raw_spin_unlock_irqrestore(&tick_device_lock, flags); -	return NOTIFY_STOP; +	return;  out_bc:  	/*  	 * Can the new device be used as a broadcast device ?  	 */ -	if (tick_check_broadcast_device(newdev)) -		ret = NOTIFY_STOP; - -	raw_spin_unlock_irqrestore(&tick_device_lock, flags); - -	return ret; +	tick_install_broadcast_device(newdev);  }  /* @@ -288,7 +337,7 @@ out_bc:   *   * Called with interrupts disabled.   */ -static void tick_handover_do_timer(int *cpup) +void tick_handover_do_timer(int *cpup)  {  	if (*cpup == tick_do_timer_cpu) {  		int cpu = cpumask_first(cpu_online_mask); @@ -305,13 +354,11 @@ static void tick_handover_do_timer(int *cpup)   * access the hardware device itself.   * We just set the mode and remove it from the lists.   */ -static void tick_shutdown(unsigned int *cpup) +void tick_shutdown(unsigned int *cpup)  {  	struct tick_device *td = &per_cpu(tick_cpu_device, *cpup);  	struct clock_event_device *dev = td->evtdev; -	unsigned long flags; -	raw_spin_lock_irqsave(&tick_device_lock, flags);  	td->mode = TICKDEV_MODE_PERIODIC;  	if (dev) {  		/* @@ -320,28 +367,23 @@ static void tick_shutdown(unsigned int *cpup)  		 */  		dev->mode = CLOCK_EVT_MODE_UNUSED;  		clockevents_exchange_device(dev, NULL); +		dev->event_handler = clockevents_handle_noop;  		td->evtdev = NULL;  	} -	raw_spin_unlock_irqrestore(&tick_device_lock, flags);  } -static void tick_suspend(void) +void tick_suspend(void)  {  	struct tick_device *td = &__get_cpu_var(tick_cpu_device); -	unsigned long flags; -	raw_spin_lock_irqsave(&tick_device_lock, flags);  	clockevents_shutdown(td->evtdev); -	raw_spin_unlock_irqrestore(&tick_device_lock, flags);  } -static void tick_resume(void) +void tick_resume(void)  {  	struct tick_device *td = &__get_cpu_var(tick_cpu_device); -	unsigned long flags;  	int broadcast = tick_resume_broadcast(); -	raw_spin_lock_irqsave(&tick_device_lock, flags);  	clockevents_set_mode(td->evtdev, CLOCK_EVT_MODE_RESUME);  	if (!broadcast) { @@ -350,67 +392,12 @@ static void tick_resume(void)  		else  			tick_resume_oneshot();  	} -	raw_spin_unlock_irqrestore(&tick_device_lock, flags); -} - -/* - * Notification about clock event devices - */ -static int tick_notify(struct notifier_block *nb, unsigned long reason, -			       void *dev) -{ -	switch (reason) { - -	case CLOCK_EVT_NOTIFY_ADD: -		return tick_check_new_device(dev); - -	case CLOCK_EVT_NOTIFY_BROADCAST_ON: -	case CLOCK_EVT_NOTIFY_BROADCAST_OFF: -	case CLOCK_EVT_NOTIFY_BROADCAST_FORCE: -		tick_broadcast_on_off(reason, dev); -		break; - -	case CLOCK_EVT_NOTIFY_BROADCAST_ENTER: -	case CLOCK_EVT_NOTIFY_BROADCAST_EXIT: -		tick_broadcast_oneshot_control(reason); -		break; - -	case CLOCK_EVT_NOTIFY_CPU_DYING: -		tick_handover_do_timer(dev); -		break; - -	case CLOCK_EVT_NOTIFY_CPU_DEAD: -		tick_shutdown_broadcast_oneshot(dev); -		tick_shutdown_broadcast(dev); -		tick_shutdown(dev); -		break; - -	case 
CLOCK_EVT_NOTIFY_SUSPEND: -		tick_suspend(); -		tick_suspend_broadcast(); -		break; - -	case CLOCK_EVT_NOTIFY_RESUME: -		tick_resume(); -		break; - -	default: -		break; -	} - -	return NOTIFY_OK;  } -static struct notifier_block tick_notifier = { -	.notifier_call = tick_notify, -}; -  /**   * tick_init - initialize the tick control - * - * Register the notifier with the clockevents framework   */  void __init tick_init(void)  { -	clockevents_register_notifier(&tick_notifier); +	tick_broadcast_init();  } diff --git a/kernel/time/tick-internal.h b/kernel/time/tick-internal.h index 290eefbc1f6..7ab92b19965 100644 --- a/kernel/time/tick-internal.h +++ b/kernel/time/tick-internal.h @@ -1,6 +1,14 @@  /*   * tick internal variable and functions used by low/high res code   */ +#include <linux/hrtimer.h> +#include <linux/tick.h> + +extern seqlock_t jiffies_lock; + +#define CS_NAME_LEN	32 + +#ifdef CONFIG_GENERIC_CLOCKEVENTS_BUILD  #define TICK_DO_TIMER_NONE	-1  #define TICK_DO_TIMER_BOOT	-2 @@ -12,9 +20,19 @@ extern int tick_do_timer_cpu __read_mostly;  extern void tick_setup_periodic(struct clock_event_device *dev, int broadcast);  extern void tick_handle_periodic(struct clock_event_device *dev); +extern void tick_check_new_device(struct clock_event_device *dev); +extern void tick_handover_do_timer(int *cpup); +extern void tick_shutdown(unsigned int *cpup); +extern void tick_suspend(void); +extern void tick_resume(void); +extern bool tick_check_replacement(struct clock_event_device *curdev, +				   struct clock_event_device *newdev); +extern void tick_install_replacement(struct clock_event_device *dev);  extern void clockevents_shutdown(struct clock_event_device *dev); +extern ssize_t sysfs_get_uname(const char *buf, char *dst, size_t cnt); +  /*   * NO_HZ / high resolution timer shared code   */ @@ -22,30 +40,30 @@ extern void clockevents_shutdown(struct clock_event_device *dev);  extern void tick_setup_oneshot(struct clock_event_device *newdev,  			       void (*handler)(struct clock_event_device *),  			       ktime_t nextevt); -extern int tick_dev_program_event(struct clock_event_device *dev, -				  ktime_t expires, int force);  extern int tick_program_event(ktime_t expires, int force);  extern void tick_oneshot_notify(void);  extern int tick_switch_to_oneshot(void (*handler)(struct clock_event_device *));  extern void tick_resume_oneshot(void);  # ifdef CONFIG_GENERIC_CLOCKEVENTS_BROADCAST  extern void tick_broadcast_setup_oneshot(struct clock_event_device *bc); -extern void tick_broadcast_oneshot_control(unsigned long reason); +extern int tick_broadcast_oneshot_control(unsigned long reason);  extern void tick_broadcast_switch_to_oneshot(void);  extern void tick_shutdown_broadcast_oneshot(unsigned int *cpup);  extern int tick_resume_broadcast_oneshot(struct clock_event_device *bc);  extern int tick_broadcast_oneshot_active(void); -extern void tick_check_oneshot_broadcast(int cpu); +extern void tick_check_oneshot_broadcast_this_cpu(void); +bool tick_broadcast_oneshot_available(void);  # else /* BROADCAST */  static inline void tick_broadcast_setup_oneshot(struct clock_event_device *bc)  {  	BUG();  } -static inline void tick_broadcast_oneshot_control(unsigned long reason) { } +static inline int tick_broadcast_oneshot_control(unsigned long reason) { return 0; }  static inline void tick_broadcast_switch_to_oneshot(void) { }  static inline void tick_shutdown_broadcast_oneshot(unsigned int *cpup) { }  static inline int tick_broadcast_oneshot_active(void) { return 0; } -static inline void 
tick_check_oneshot_broadcast(int cpu) { } +static inline void tick_check_oneshot_broadcast_this_cpu(void) { } +static inline bool tick_broadcast_oneshot_available(void) { return true; }  # endif /* !BROADCAST */  #else /* !ONESHOT */ @@ -69,13 +87,14 @@ static inline void tick_broadcast_setup_oneshot(struct clock_event_device *bc)  {  	BUG();  } -static inline void tick_broadcast_oneshot_control(unsigned long reason) { } +static inline int tick_broadcast_oneshot_control(unsigned long reason) { return 0; }  static inline void tick_shutdown_broadcast_oneshot(unsigned int *cpup) { }  static inline int tick_resume_broadcast_oneshot(struct clock_event_device *bc)  {  	return 0;  }  static inline int tick_broadcast_oneshot_active(void) { return 0; } +static inline bool tick_broadcast_oneshot_available(void) { return false; }  #endif /* !TICK_ONESHOT */  /* @@ -83,21 +102,21 @@ static inline int tick_broadcast_oneshot_active(void) { return 0; }   */  #ifdef CONFIG_GENERIC_CLOCKEVENTS_BROADCAST  extern int tick_device_uses_broadcast(struct clock_event_device *dev, int cpu); -extern int tick_check_broadcast_device(struct clock_event_device *dev); +extern void tick_install_broadcast_device(struct clock_event_device *dev);  extern int tick_is_broadcast_device(struct clock_event_device *dev);  extern void tick_broadcast_on_off(unsigned long reason, int *oncpu);  extern void tick_shutdown_broadcast(unsigned int *cpup);  extern void tick_suspend_broadcast(void);  extern int tick_resume_broadcast(void); - +extern void tick_broadcast_init(void);  extern void  tick_set_periodic_handler(struct clock_event_device *dev, int broadcast); +int tick_broadcast_update_freq(struct clock_event_device *dev, u32 freq);  #else /* !BROADCAST */ -static inline int tick_check_broadcast_device(struct clock_event_device *dev) +static inline void tick_install_broadcast_device(struct clock_event_device *dev)  { -	return 0;  }  static inline int tick_is_broadcast_device(struct clock_event_device *dev) @@ -114,6 +133,9 @@ static inline void tick_broadcast_on_off(unsigned long reason, int *oncpu) { }  static inline void tick_shutdown_broadcast(unsigned int *cpup) { }  static inline void tick_suspend_broadcast(void) { }  static inline int tick_resume_broadcast(void) { return 0; } +static inline void tick_broadcast_init(void) { } +static inline int tick_broadcast_update_freq(struct clock_event_device *dev, +					     u32 freq) { return -ENODEV; }  /*   * Set the periodic handler in non broadcast mode @@ -132,3 +154,10 @@ static inline int tick_device_is_functional(struct clock_event_device *dev)  {  	return !(dev->features & CLOCK_EVT_FEAT_DUMMY);  } + +int __clockevents_update_freq(struct clock_event_device *dev, u32 freq); + +#endif + +extern void do_timer(unsigned long ticks); +extern void update_wall_time(void); diff --git a/kernel/time/tick-oneshot.c b/kernel/time/tick-oneshot.c index aada0e52680..824109060a3 100644 --- a/kernel/time/tick-oneshot.c +++ b/kernel/time/tick-oneshot.c @@ -18,86 +18,17 @@  #include <linux/percpu.h>  #include <linux/profile.h>  #include <linux/sched.h> -#include <linux/tick.h>  #include "tick-internal.h" -/* Limit min_delta to a jiffie */ -#define MIN_DELTA_LIMIT		(NSEC_PER_SEC / HZ) - -static int tick_increase_min_delta(struct clock_event_device *dev) -{ -	/* Nothing to do if we already reached the limit */ -	if (dev->min_delta_ns >= MIN_DELTA_LIMIT) -		return -ETIME; - -	if (dev->min_delta_ns < 5000) -		dev->min_delta_ns = 5000; -	else -		dev->min_delta_ns += dev->min_delta_ns >> 1; - -	if 
(dev->min_delta_ns > MIN_DELTA_LIMIT) -		dev->min_delta_ns = MIN_DELTA_LIMIT; - -	printk(KERN_WARNING "CE: %s increased min_delta_ns to %llu nsec\n", -	       dev->name ? dev->name : "?", -	       (unsigned long long) dev->min_delta_ns); -	return 0; -} - -/** - * tick_program_event internal worker function - */ -int tick_dev_program_event(struct clock_event_device *dev, ktime_t expires, -			   int force) -{ -	ktime_t now = ktime_get(); -	int i; - -	for (i = 0;;) { -		int ret = clockevents_program_event(dev, expires, now); - -		if (!ret || !force) -			return ret; - -		dev->retries++; -		/* -		 * We tried 3 times to program the device with the given -		 * min_delta_ns. If that's not working then we increase it -		 * and emit a warning. -		 */ -		if (++i > 2) { -			/* Increase the min. delta and try again */ -			if (tick_increase_min_delta(dev)) { -				/* -				 * Get out of the loop if min_delta_ns -				 * hit the limit already. That's -				 * better than staying here forever. -				 * -				 * We clear next_event so we have a -				 * chance that the box survives. -				 */ -				printk(KERN_WARNING -				       "CE: Reprogramming failure. Giving up\n"); -				dev->next_event.tv64 = KTIME_MAX; -				return -ETIME; -			} -			i = 0; -		} - -		now = ktime_get(); -		expires = ktime_add_ns(now, dev->min_delta_ns); -	} -} -  /**   * tick_program_event   */  int tick_program_event(ktime_t expires, int force)  { -	struct clock_event_device *dev = __get_cpu_var(tick_cpu_device).evtdev; +	struct clock_event_device *dev = __this_cpu_read(tick_cpu_device.evtdev); -	return tick_dev_program_event(dev, expires, force); +	return clockevents_program_event(dev, expires, force);  }  /** @@ -105,11 +36,10 @@ int tick_program_event(ktime_t expires, int force)   */  void tick_resume_oneshot(void)  { -	struct tick_device *td = &__get_cpu_var(tick_cpu_device); -	struct clock_event_device *dev = td->evtdev; +	struct clock_event_device *dev = __this_cpu_read(tick_cpu_device.evtdev);  	clockevents_set_mode(dev, CLOCK_EVT_MODE_ONESHOT); -	tick_program_event(ktime_get(), 1); +	clockevents_program_event(dev, ktime_get(), true);  }  /** @@ -121,7 +51,7 @@ void tick_setup_oneshot(struct clock_event_device *newdev,  {  	newdev->event_handler = handler;  	clockevents_set_mode(newdev, CLOCK_EVT_MODE_ONESHOT); -	tick_dev_program_event(newdev, next_event, 1); +	clockevents_program_event(newdev, next_event, true);  }  /** @@ -167,7 +97,7 @@ int tick_oneshot_mode_active(void)  	int ret;  	local_irq_save(flags); -	ret = __get_cpu_var(tick_cpu_device).mode == TICKDEV_MODE_ONESHOT; +	ret = __this_cpu_read(tick_cpu_device.mode) == TICKDEV_MODE_ONESHOT;  	local_irq_restore(flags);  	return ret; diff --git a/kernel/time/tick-sched.c b/kernel/time/tick-sched.c index 3e216e01bbd..6558b7ac112 100644 --- a/kernel/time/tick-sched.c +++ b/kernel/time/tick-sched.c @@ -19,20 +19,25 @@  #include <linux/percpu.h>  #include <linux/profile.h>  #include <linux/sched.h> -#include <linux/tick.h>  #include <linux/module.h> +#include <linux/irq_work.h> +#include <linux/posix-timers.h> +#include <linux/perf_event.h> +#include <linux/context_tracking.h>  #include <asm/irq_regs.h>  #include "tick-internal.h" +#include <trace/events/timer.h> +  /*   * Per cpu nohz control structure   */ -static DEFINE_PER_CPU(struct tick_sched, tick_cpu_sched); +DEFINE_PER_CPU(struct tick_sched, tick_cpu_sched);  /* - * The time, when the last jiffy update happened. Protected by xtime_lock. + * The time, when the last jiffy update happened. Protected by jiffies_lock.   
*/  static ktime_t last_jiffies_update; @@ -50,14 +55,14 @@ static void tick_do_update_jiffies64(ktime_t now)  	ktime_t delta;  	/* -	 * Do a quick check without holding xtime_lock: +	 * Do a quick check without holding jiffies_lock:  	 */  	delta = ktime_sub(now, last_jiffies_update);  	if (delta.tv64 < tick_period.tv64)  		return; -	/* Reevalute with xtime_lock held */ -	write_seqlock(&xtime_lock); +	/* Reevalute with jiffies_lock held */ +	write_seqlock(&jiffies_lock);  	delta = ktime_sub(now, last_jiffies_update);  	if (delta.tv64 >= tick_period.tv64) { @@ -79,8 +84,12 @@ static void tick_do_update_jiffies64(ktime_t now)  		/* Keep the tick_next_period variable up to date */  		tick_next_period = ktime_add(last_jiffies_update, tick_period); +	} else { +		write_sequnlock(&jiffies_lock); +		return;  	} -	write_sequnlock(&xtime_lock); +	write_sequnlock(&jiffies_lock); +	update_wall_time();  }  /* @@ -90,24 +99,274 @@ static ktime_t tick_init_jiffy_update(void)  {  	ktime_t period; -	write_seqlock(&xtime_lock); +	write_seqlock(&jiffies_lock);  	/* Did we start the jiffies update yet ? */  	if (last_jiffies_update.tv64 == 0)  		last_jiffies_update = tick_next_period;  	period = last_jiffies_update; -	write_sequnlock(&xtime_lock); +	write_sequnlock(&jiffies_lock);  	return period;  } + +static void tick_sched_do_timer(ktime_t now) +{ +	int cpu = smp_processor_id(); + +#ifdef CONFIG_NO_HZ_COMMON +	/* +	 * Check if the do_timer duty was dropped. We don't care about +	 * concurrency: This happens only when the cpu in charge went +	 * into a long sleep. If two cpus happen to assign themself to +	 * this duty, then the jiffies update is still serialized by +	 * jiffies_lock. +	 */ +	if (unlikely(tick_do_timer_cpu == TICK_DO_TIMER_NONE) +	    && !tick_nohz_full_cpu(cpu)) +		tick_do_timer_cpu = cpu; +#endif + +	/* Check, if the jiffies need an update */ +	if (tick_do_timer_cpu == cpu) +		tick_do_update_jiffies64(now); +} + +static void tick_sched_handle(struct tick_sched *ts, struct pt_regs *regs) +{ +#ifdef CONFIG_NO_HZ_COMMON +	/* +	 * When we are idle and the tick is stopped, we have to touch +	 * the watchdog as we might not schedule for a really long +	 * time. This happens on complete idle SMP systems while +	 * waiting on the login prompt. We also increment the "start of +	 * idle" jiffy stamp so the idle accounting adjustment we do +	 * when we go busy again does not account too much ticks. +	 */ +	if (ts->tick_stopped) { +		touch_softlockup_watchdog(); +		if (is_idle_task(current)) +			ts->idle_jiffies++; +	} +#endif +	update_process_times(user_mode(regs)); +	profile_tick(CPU_PROFILING); +} + +#ifdef CONFIG_NO_HZ_FULL +cpumask_var_t tick_nohz_full_mask; +bool tick_nohz_full_running; + +static bool can_stop_full_tick(void) +{ +	WARN_ON_ONCE(!irqs_disabled()); + +	if (!sched_can_stop_tick()) { +		trace_tick_stop(0, "more than 1 task in runqueue\n"); +		return false; +	} + +	if (!posix_cpu_timers_can_stop_tick(current)) { +		trace_tick_stop(0, "posix timers running\n"); +		return false; +	} + +	if (!perf_event_can_stop_tick()) { +		trace_tick_stop(0, "perf events running\n"); +		return false; +	} + +	/* sched_clock_tick() needs us? */ +#ifdef CONFIG_HAVE_UNSTABLE_SCHED_CLOCK +	/* +	 * TODO: kick full dynticks CPUs when +	 * sched_clock_stable is set. +	 */ +	if (!sched_clock_stable()) { +		trace_tick_stop(0, "unstable sched clock\n"); +		/* +		 * Don't allow the user to think they can get +		 * full NO_HZ with this machine. 
+		 */ +		WARN_ONCE(tick_nohz_full_running, +			  "NO_HZ FULL will not work with unstable sched clock"); +		return false; +	} +#endif + +	return true; +} + +static void tick_nohz_restart_sched_tick(struct tick_sched *ts, ktime_t now); + +/* + * Re-evaluate the need for the tick on the current CPU + * and restart it if necessary. + */ +void __tick_nohz_full_check(void) +{ +	struct tick_sched *ts = &__get_cpu_var(tick_cpu_sched); + +	if (tick_nohz_full_cpu(smp_processor_id())) { +		if (ts->tick_stopped && !is_idle_task(current)) { +			if (!can_stop_full_tick()) +				tick_nohz_restart_sched_tick(ts, ktime_get()); +		} +	} +} + +static void nohz_full_kick_work_func(struct irq_work *work) +{ +	__tick_nohz_full_check(); +} + +static DEFINE_PER_CPU(struct irq_work, nohz_full_kick_work) = { +	.func = nohz_full_kick_work_func, +}; + +/* + * Kick the current CPU if it's full dynticks in order to force it to + * re-evaluate its dependency on the tick and restart it if necessary. + */ +void tick_nohz_full_kick(void) +{ +	if (tick_nohz_full_cpu(smp_processor_id())) +		irq_work_queue(&__get_cpu_var(nohz_full_kick_work)); +} + +static void nohz_full_kick_ipi(void *info) +{ +	__tick_nohz_full_check(); +} + +/* + * Kick all full dynticks CPUs in order to force these to re-evaluate + * their dependency on the tick and restart it if necessary. + */ +void tick_nohz_full_kick_all(void) +{ +	if (!tick_nohz_full_running) +		return; + +	preempt_disable(); +	smp_call_function_many(tick_nohz_full_mask, +			       nohz_full_kick_ipi, NULL, false); +	tick_nohz_full_kick(); +	preempt_enable(); +} + +/* + * Re-evaluate the need for the tick as we switch the current task. + * It might need the tick due to per task/process properties: + * perf events, posix cpu timers, ... + */ +void __tick_nohz_task_switch(struct task_struct *tsk) +{ +	unsigned long flags; + +	local_irq_save(flags); + +	if (!tick_nohz_full_cpu(smp_processor_id())) +		goto out; + +	if (tick_nohz_tick_stopped() && !can_stop_full_tick()) +		tick_nohz_full_kick(); + +out: +	local_irq_restore(flags); +} + +/* Parse the boot-time nohz CPU list from the kernel parameters. */ +static int __init tick_nohz_full_setup(char *str) +{ +	int cpu; + +	alloc_bootmem_cpumask_var(&tick_nohz_full_mask); +	if (cpulist_parse(str, tick_nohz_full_mask) < 0) { +		pr_warning("NOHZ: Incorrect nohz_full cpumask\n"); +		return 1; +	} + +	cpu = smp_processor_id(); +	if (cpumask_test_cpu(cpu, tick_nohz_full_mask)) { +		pr_warning("NO_HZ: Clearing %d from nohz_full range for timekeeping\n", cpu); +		cpumask_clear_cpu(cpu, tick_nohz_full_mask); +	} +	tick_nohz_full_running = true; + +	return 1; +} +__setup("nohz_full=", tick_nohz_full_setup); + +static int tick_nohz_cpu_down_callback(struct notifier_block *nfb, +						 unsigned long action, +						 void *hcpu) +{ +	unsigned int cpu = (unsigned long)hcpu; + +	switch (action & ~CPU_TASKS_FROZEN) { +	case CPU_DOWN_PREPARE: +		/* +		 * If we handle the timekeeping duty for full dynticks CPUs, +		 * we can't safely shutdown that CPU. +		 */ +		if (tick_nohz_full_running && tick_do_timer_cpu == cpu) +			return NOTIFY_BAD; +		break; +	} +	return NOTIFY_OK; +} + +/* + * Worst case string length in chunks of CPU range seems 2 steps + * separations: 0,2,4,6,... 
+ * This is NR_CPUS + sizeof('\0') + */ +static char __initdata nohz_full_buf[NR_CPUS + 1]; + +static int tick_nohz_init_all(void) +{ +	int err = -1; + +#ifdef CONFIG_NO_HZ_FULL_ALL +	if (!alloc_cpumask_var(&tick_nohz_full_mask, GFP_KERNEL)) { +		pr_err("NO_HZ: Can't allocate full dynticks cpumask\n"); +		return err; +	} +	err = 0; +	cpumask_setall(tick_nohz_full_mask); +	cpumask_clear_cpu(smp_processor_id(), tick_nohz_full_mask); +	tick_nohz_full_running = true; +#endif +	return err; +} + +void __init tick_nohz_init(void) +{ +	int cpu; + +	if (!tick_nohz_full_running) { +		if (tick_nohz_init_all() < 0) +			return; +	} + +	for_each_cpu(cpu, tick_nohz_full_mask) +		context_tracking_cpu_set(cpu); + +	cpu_notifier(tick_nohz_cpu_down_callback, 0); +	cpulist_scnprintf(nohz_full_buf, sizeof(nohz_full_buf), tick_nohz_full_mask); +	pr_info("NO_HZ: Full dynticks CPUs: %s.\n", nohz_full_buf); +} +#endif +  /*   * NOHZ - aka dynamic tick functionality   */ -#ifdef CONFIG_NO_HZ +#ifdef CONFIG_NO_HZ_COMMON  /*   * NO HZ enabled ?   */  static int tick_nohz_enabled __read_mostly  = 1; - +int tick_nohz_active  __read_mostly;  /*   * Enable / Disable tickless mode   */ @@ -136,12 +395,9 @@ __setup("nohz=", setup_tick_nohz);   */  static void tick_nohz_update_jiffies(ktime_t now)  { -	int cpu = smp_processor_id(); -	struct tick_sched *ts = &per_cpu(tick_cpu_sched, cpu);  	unsigned long flags; -	cpumask_clear_cpu(cpu, nohz_cpu_mask); -	ts->idle_waketime = now; +	__this_cpu_write(tick_cpu_sched.idle_waketime, now);  	local_irq_save(flags);  	tick_do_update_jiffies64(now); @@ -160,9 +416,10 @@ update_ts_time_stats(int cpu, struct tick_sched *ts, ktime_t now, u64 *last_upda  	if (ts->idle_active) {  		delta = ktime_sub(now, ts->idle_entrytime); -		ts->idle_sleeptime = ktime_add(ts->idle_sleeptime, delta);  		if (nr_iowait_cpu(cpu) > 0)  			ts->iowait_sleeptime = ktime_add(ts->iowait_sleeptime, delta); +		else +			ts->idle_sleeptime = ktime_add(ts->idle_sleeptime, delta);  		ts->idle_entrytime = now;  	} @@ -171,23 +428,17 @@ update_ts_time_stats(int cpu, struct tick_sched *ts, ktime_t now, u64 *last_upda  } -static void tick_nohz_stop_idle(int cpu, ktime_t now) +static void tick_nohz_stop_idle(struct tick_sched *ts, ktime_t now)  { -	struct tick_sched *ts = &per_cpu(tick_cpu_sched, cpu); - -	update_ts_time_stats(cpu, ts, now, NULL); +	update_ts_time_stats(smp_processor_id(), ts, now, NULL);  	ts->idle_active = 0;  	sched_clock_idle_wakeup_event(0);  } -static ktime_t tick_nohz_start_idle(int cpu, struct tick_sched *ts) +static ktime_t tick_nohz_start_idle(struct tick_sched *ts)  { -	ktime_t now; - -	now = ktime_get(); - -	update_ts_time_stats(cpu, ts, now, NULL); +	ktime_t now = ktime_get();  	ts->idle_entrytime = now;  	ts->idle_active = 1; @@ -198,11 +449,11 @@ static ktime_t tick_nohz_start_idle(int cpu, struct tick_sched *ts)  /**   * get_cpu_idle_time_us - get the total idle time of a cpu   * @cpu: CPU number to query - * @last_update_time: variable to store update time in + * @last_update_time: variable to store update time in. Do not update + * counters if NULL.   *   * Return the cummulative idle time (since boot) for a given - * CPU, in microseconds. The idle time returned includes - * the iowait time (unlike what "top" and co report). + * CPU, in microseconds.   *   * This time is measured via accounting rather than sampling,   * and is as accurate as ktime_get() is. 
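/*
 * [Editor's note -- illustrative sketch, not part of this patch.]
 * The hunk above changes the contract of get_cpu_idle_time_us(): a
 * non-NULL @last_update_time updates the accounting and reports the
 * sample time, while NULL only reads the current value. A cpufreq-style
 * caller might use it like the helper below; it is hypothetical and
 * single-slot rather than per-CPU, and only get_cpu_idle_time_us()
 * itself comes from this file.
 */
#include <linux/math64.h>
#include <linux/tick.h>
#include <linux/types.h>

static u64 prev_idle_us, prev_wall_us;

/* Return busy time in permille over the interval since the last call. */
static unsigned int sample_busy_permille(int cpu)
{
	u64 wall_us, idle_us, d_idle, d_wall;

	idle_us = get_cpu_idle_time_us(cpu, &wall_us);
	if (idle_us == (u64)-1)		/* NO_HZ idle accounting not active */
		return 0;

	d_idle = idle_us - prev_idle_us;
	d_wall = wall_us - prev_wall_us;
	prev_idle_us = idle_us;
	prev_wall_us = wall_us;

	if (!d_wall || d_idle > d_wall)
		return 0;

	return (unsigned int)div64_u64((d_wall - d_idle) * 1000, d_wall);
}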
@@ -212,20 +463,35 @@ static ktime_t tick_nohz_start_idle(int cpu, struct tick_sched *ts)  u64 get_cpu_idle_time_us(int cpu, u64 *last_update_time)  {  	struct tick_sched *ts = &per_cpu(tick_cpu_sched, cpu); +	ktime_t now, idle; -	if (!tick_nohz_enabled) +	if (!tick_nohz_active)  		return -1; -	update_ts_time_stats(cpu, ts, ktime_get(), last_update_time); +	now = ktime_get(); +	if (last_update_time) { +		update_ts_time_stats(cpu, ts, now, last_update_time); +		idle = ts->idle_sleeptime; +	} else { +		if (ts->idle_active && !nr_iowait_cpu(cpu)) { +			ktime_t delta = ktime_sub(now, ts->idle_entrytime); + +			idle = ktime_add(ts->idle_sleeptime, delta); +		} else { +			idle = ts->idle_sleeptime; +		} +	} + +	return ktime_to_us(idle); -	return ktime_to_us(ts->idle_sleeptime);  }  EXPORT_SYMBOL_GPL(get_cpu_idle_time_us); -/* +/**   * get_cpu_iowait_time_us - get the total iowait time of a cpu   * @cpu: CPU number to query - * @last_update_time: variable to store update time in + * @last_update_time: variable to store update time in. Do not update + * counters if NULL.   *   * Return the cummulative iowait time (since boot) for a given   * CPU, in microseconds. @@ -238,106 +504,66 @@ EXPORT_SYMBOL_GPL(get_cpu_idle_time_us);  u64 get_cpu_iowait_time_us(int cpu, u64 *last_update_time)  {  	struct tick_sched *ts = &per_cpu(tick_cpu_sched, cpu); +	ktime_t now, iowait; -	if (!tick_nohz_enabled) +	if (!tick_nohz_active)  		return -1; -	update_ts_time_stats(cpu, ts, ktime_get(), last_update_time); +	now = ktime_get(); +	if (last_update_time) { +		update_ts_time_stats(cpu, ts, now, last_update_time); +		iowait = ts->iowait_sleeptime; +	} else { +		if (ts->idle_active && nr_iowait_cpu(cpu) > 0) { +			ktime_t delta = ktime_sub(now, ts->idle_entrytime); + +			iowait = ktime_add(ts->iowait_sleeptime, delta); +		} else { +			iowait = ts->iowait_sleeptime; +		} +	} -	return ktime_to_us(ts->iowait_sleeptime); +	return ktime_to_us(iowait);  }  EXPORT_SYMBOL_GPL(get_cpu_iowait_time_us); -/** - * tick_nohz_stop_sched_tick - stop the idle tick from the idle task - * - * When the next event is more than a tick into the future, stop the idle tick - * Called either from the idle loop or from irq_exit() when an idle period was - * just interrupted by an interrupt which did not cause a reschedule. - */ -void tick_nohz_stop_sched_tick(int inidle) +static ktime_t tick_nohz_stop_sched_tick(struct tick_sched *ts, +					 ktime_t now, int cpu)  { -	unsigned long seq, last_jiffies, next_jiffies, delta_jiffies, flags; -	struct tick_sched *ts; -	ktime_t last_update, expires, now; +	unsigned long seq, last_jiffies, next_jiffies, delta_jiffies; +	ktime_t last_update, expires, ret = { .tv64 = 0 }; +	unsigned long rcu_delta_jiffies;  	struct clock_event_device *dev = __get_cpu_var(tick_cpu_device).evtdev;  	u64 time_delta; -	int cpu; - -	local_irq_save(flags); - -	cpu = smp_processor_id(); -	ts = &per_cpu(tick_cpu_sched, cpu); - -	/* -	 * Call to tick_nohz_start_idle stops the last_update_time from being -	 * updated. Thus, it must not be called in the event we are called from -	 * irq_exit() with the prior state different than idle. -	 */ -	if (!inidle && !ts->inidle) -		goto end; - -	/* -	 * Set ts->inidle unconditionally. Even if the system did not -	 * switch to NOHZ mode the cpu frequency governers rely on the -	 * update of the idle time accounting in tick_nohz_start_idle(). 
-	 */ -	ts->inidle = 1; - -	now = tick_nohz_start_idle(cpu, ts); - -	/* -	 * If this cpu is offline and it is the one which updates -	 * jiffies, then give up the assignment and let it be taken by -	 * the cpu which runs the tick timer next. If we don't drop -	 * this here the jiffies might be stale and do_timer() never -	 * invoked. -	 */ -	if (unlikely(!cpu_online(cpu))) { -		if (cpu == tick_do_timer_cpu) -			tick_do_timer_cpu = TICK_DO_TIMER_NONE; -	} - -	if (unlikely(ts->nohz_mode == NOHZ_MODE_INACTIVE)) -		goto end; - -	if (need_resched()) -		goto end; - -	if (unlikely(local_softirq_pending() && cpu_online(cpu))) { -		static int ratelimit; -		if (ratelimit < 10) { -			printk(KERN_ERR "NOHZ: local_softirq_pending %02x\n", -			       (unsigned int) local_softirq_pending()); -			ratelimit++; -		} -		goto end; -	} +	time_delta = timekeeping_max_deferment(); -	ts->idle_calls++;  	/* Read jiffies and the time when jiffies were updated last */  	do { -		seq = read_seqbegin(&xtime_lock); +		seq = read_seqbegin(&jiffies_lock);  		last_update = last_jiffies_update;  		last_jiffies = jiffies; -		time_delta = timekeeping_max_deferment(); -	} while (read_seqretry(&xtime_lock, seq)); +	} while (read_seqretry(&jiffies_lock, seq)); -	if (rcu_needs_cpu(cpu) || printk_needs_cpu(cpu) || -	    arch_needs_cpu(cpu)) { +	if (rcu_needs_cpu(cpu, &rcu_delta_jiffies) || +	    arch_needs_cpu(cpu) || irq_work_needs_cpu()) {  		next_jiffies = last_jiffies + 1;  		delta_jiffies = 1;  	} else {  		/* Get the next timer wheel timer */  		next_jiffies = get_next_timer_interrupt(last_jiffies);  		delta_jiffies = next_jiffies - last_jiffies; +		if (rcu_delta_jiffies < delta_jiffies) { +			next_jiffies = last_jiffies + rcu_delta_jiffies; +			delta_jiffies = rcu_delta_jiffies; +		}  	} +  	/* -	 * Do not stop the tick, if we are only one off -	 * or if the cpu is required for rcu +	 * Do not stop the tick, if we are only one off (or less) +	 * or if the cpu is required for RCU:  	 */ -	if (!ts->tick_stopped && delta_jiffies == 1) +	if (!ts->tick_stopped && delta_jiffies <= 1)  		goto out;  	/* Schedule the tick, if we are at least one jiffie off */ @@ -366,6 +592,13 @@ void tick_nohz_stop_sched_tick(int inidle)  			time_delta = KTIME_MAX;  		} +#ifdef CONFIG_NO_HZ_FULL +		if (!ts->inidle) { +			time_delta = min(time_delta, +					 scheduler_tick_max_deferment()); +		} +#endif +  		/*  		 * calculate the expiry time for the next timer wheel  		 * timer. delta_jiffies >= NEXT_TIMER_MAX_DELTA signals @@ -390,13 +623,12 @@ void tick_nohz_stop_sched_tick(int inidle)  		else  			expires.tv64 = KTIME_MAX; -		if (delta_jiffies > 1) -			cpumask_set_cpu(cpu, nohz_cpu_mask); -  		/* Skip reprogram of event if its not changed */  		if (ts->tick_stopped && ktime_equal(expires, dev->next_event))  			goto out; +		ret = expires; +  		/*  		 * nohz_stop_sched_tick can be called several times before  		 * the nohz_restart_sched_tick is called. This happens when @@ -405,19 +637,14 @@ void tick_nohz_stop_sched_tick(int inidle)  		 * the scheduler tick in nohz_restart_sched_tick.  		 
*/  		if (!ts->tick_stopped) { -			select_nohz_load_balancer(1); +			nohz_balance_enter_idle(cpu); +			calc_load_enter_idle(); -			ts->idle_tick = hrtimer_get_expires(&ts->sched_timer); +			ts->last_tick = hrtimer_get_expires(&ts->sched_timer);  			ts->tick_stopped = 1; -			ts->idle_jiffies = last_jiffies; -			rcu_enter_nohz(); +			trace_tick_stop(1, " ");  		} -		ts->idle_sleeps++; - -		/* Mark expires */ -		ts->idle_expires = expires; -  		/*  		 * If the expiration time == KTIME_MAX, then  		 * in this case we simply stop the tick timer. @@ -442,15 +669,162 @@ void tick_nohz_stop_sched_tick(int inidle)  		 * softirq.  		 */  		tick_do_update_jiffies64(ktime_get()); -		cpumask_clear_cpu(cpu, nohz_cpu_mask);  	}  	raise_softirq_irqoff(TIMER_SOFTIRQ);  out:  	ts->next_jiffies = next_jiffies;  	ts->last_jiffies = last_jiffies;  	ts->sleep_length = ktime_sub(dev->next_event, now); -end: -	local_irq_restore(flags); + +	return ret; +} + +static void tick_nohz_full_stop_tick(struct tick_sched *ts) +{ +#ifdef CONFIG_NO_HZ_FULL +	int cpu = smp_processor_id(); + +	if (!tick_nohz_full_cpu(cpu) || is_idle_task(current)) +		return; + +	if (!ts->tick_stopped && ts->nohz_mode == NOHZ_MODE_INACTIVE) +		return; + +	if (!can_stop_full_tick()) +		return; + +	tick_nohz_stop_sched_tick(ts, ktime_get(), cpu); +#endif +} + +static bool can_stop_idle_tick(int cpu, struct tick_sched *ts) +{ +	/* +	 * If this cpu is offline and it is the one which updates +	 * jiffies, then give up the assignment and let it be taken by +	 * the cpu which runs the tick timer next. If we don't drop +	 * this here the jiffies might be stale and do_timer() never +	 * invoked. +	 */ +	if (unlikely(!cpu_online(cpu))) { +		if (cpu == tick_do_timer_cpu) +			tick_do_timer_cpu = TICK_DO_TIMER_NONE; +		return false; +	} + +	if (unlikely(ts->nohz_mode == NOHZ_MODE_INACTIVE)) { +		ts->sleep_length = (ktime_t) { .tv64 = NSEC_PER_SEC/HZ }; +		return false; +	} + +	if (need_resched()) +		return false; + +	if (unlikely(local_softirq_pending() && cpu_online(cpu))) { +		static int ratelimit; + +		if (ratelimit < 10 && +		    (local_softirq_pending() & SOFTIRQ_STOP_IDLE_MASK)) { +			pr_warn("NOHZ: local_softirq_pending %02x\n", +				(unsigned int) local_softirq_pending()); +			ratelimit++; +		} +		return false; +	} + +	if (tick_nohz_full_enabled()) { +		/* +		 * Keep the tick alive to guarantee timekeeping progression +		 * if there are full dynticks CPUs around +		 */ +		if (tick_do_timer_cpu == cpu) +			return false; +		/* +		 * Boot safety: make sure the timekeeping duty has been +		 * assigned before entering dyntick-idle mode, +		 */ +		if (tick_do_timer_cpu == TICK_DO_TIMER_NONE) +			return false; +	} + +	return true; +} + +static void __tick_nohz_idle_enter(struct tick_sched *ts) +{ +	ktime_t now, expires; +	int cpu = smp_processor_id(); + +	now = tick_nohz_start_idle(ts); + +	if (can_stop_idle_tick(cpu, ts)) { +		int was_stopped = ts->tick_stopped; + +		ts->idle_calls++; + +		expires = tick_nohz_stop_sched_tick(ts, now, cpu); +		if (expires.tv64 > 0LL) { +			ts->idle_sleeps++; +			ts->idle_expires = expires; +		} + +		if (!was_stopped && ts->tick_stopped) +			ts->idle_jiffies = ts->last_jiffies; +	} +} + +/** + * tick_nohz_idle_enter - stop the idle tick from the idle task + * + * When the next event is more than a tick into the future, stop the idle tick + * Called when we start the idle loop. + * + * The arch is responsible of calling: + * + * - rcu_idle_enter() after its last use of RCU before the CPU is put + *  to sleep. 
+ * - rcu_idle_exit() before the first use of RCU after the CPU is woken up. + */ +void tick_nohz_idle_enter(void) +{ +	struct tick_sched *ts; + +	WARN_ON_ONCE(irqs_disabled()); + +	/* + 	 * Update the idle state in the scheduler domain hierarchy + 	 * when tick_nohz_stop_sched_tick() is called from the idle loop. + 	 * State will be updated to busy during the first busy tick after + 	 * exiting idle. + 	 */ +	set_cpu_sd_state_idle(); + +	local_irq_disable(); + +	ts = &__get_cpu_var(tick_cpu_sched); +	ts->inidle = 1; +	__tick_nohz_idle_enter(ts); + +	local_irq_enable(); +} +EXPORT_SYMBOL_GPL(tick_nohz_idle_enter); + +/** + * tick_nohz_irq_exit - update next tick event from interrupt exit + * + * When an interrupt fires while we are idle and it doesn't cause + * a reschedule, it may still add, modify or delete a timer, enqueue + * an RCU callback, etc... + * So we need to re-calculate and reprogram the next tick event. + */ +void tick_nohz_irq_exit(void) +{ +	struct tick_sched *ts = &__get_cpu_var(tick_cpu_sched); + +	if (ts->inidle) +		__tick_nohz_idle_enter(ts); +	else +		tick_nohz_full_stop_tick(ts);  }  /** @@ -468,7 +842,7 @@ ktime_t tick_nohz_get_sleep_length(void)  static void tick_nohz_restart(struct tick_sched *ts, ktime_t now)  {  	hrtimer_cancel(&ts->sched_timer); -	hrtimer_set_expires(&ts->sched_timer, ts->idle_tick); +	hrtimer_set_expires(&ts->sched_timer, ts->last_tick);  	while (1) {  		/* Forward the time to expire in the future */ @@ -485,49 +859,36 @@ static void tick_nohz_restart(struct tick_sched *ts, ktime_t now)  				hrtimer_get_expires(&ts->sched_timer), 0))  				break;  		} -		/* Update jiffies and reread time */ -		tick_do_update_jiffies64(now); +		/* Reread time and update jiffies */  		now = ktime_get(); +		tick_do_update_jiffies64(now);  	}  } -/** - * tick_nohz_restart_sched_tick - restart the idle tick from the idle task - * - * Restart the idle tick when the CPU is woken up from idle - */ -void tick_nohz_restart_sched_tick(void) +static void tick_nohz_restart_sched_tick(struct tick_sched *ts, ktime_t now)  { -	int cpu = smp_processor_id(); -	struct tick_sched *ts = &per_cpu(tick_cpu_sched, cpu); -#ifndef CONFIG_VIRT_CPU_ACCOUNTING -	unsigned long ticks; -#endif -	ktime_t now; - -	local_irq_disable(); -	if (ts->idle_active || (ts->inidle && ts->tick_stopped)) -		now = ktime_get(); - -	if (ts->idle_active) -		tick_nohz_stop_idle(cpu, now); - -	if (!ts->inidle || !ts->tick_stopped) { -		ts->inidle = 0; -		local_irq_enable(); -		return; -	} +	/* Update jiffies first */ +	tick_do_update_jiffies64(now); +	update_cpu_load_nohz(); -	ts->inidle = 0; +	calc_load_exit_idle(); +	touch_softlockup_watchdog(); +	/* +	 * Cancel the scheduled timer and restore the tick +	 */ +	ts->tick_stopped  = 0; +	ts->idle_exittime = now; -	rcu_exit_nohz(); +	tick_nohz_restart(ts, now); +} -	/* Update jiffies first */ -	select_nohz_load_balancer(0); -	tick_do_update_jiffies64(now); -	cpumask_clear_cpu(cpu, nohz_cpu_mask); +static void tick_nohz_account_idle_ticks(struct tick_sched *ts) +{ +#ifndef CONFIG_VIRT_CPU_ACCOUNTING_NATIVE +	unsigned long ticks; -#ifndef CONFIG_VIRT_CPU_ACCOUNTING +	if (vtime_accounting_enabled()) +		return;  	/*  	 * We stopped the tick in idle. 
Update process times would miss the  	 * time we slept as update_process_times does only a 1 tick @@ -540,18 +901,40 @@ void tick_nohz_restart_sched_tick(void)  	if (ticks && ticks < LONG_MAX)  		account_idle_ticks(ticks);  #endif +} -	touch_softlockup_watchdog(); -	/* -	 * Cancel the scheduled timer and restore the tick -	 */ -	ts->tick_stopped  = 0; -	ts->idle_exittime = now; +/** + * tick_nohz_idle_exit - restart the idle tick from the idle task + * + * Restart the idle tick when the CPU is woken up from idle + * This also exit the RCU extended quiescent state. The CPU + * can use RCU again after this function is called. + */ +void tick_nohz_idle_exit(void) +{ +	struct tick_sched *ts = &__get_cpu_var(tick_cpu_sched); +	ktime_t now; -	tick_nohz_restart(ts, now); +	local_irq_disable(); + +	WARN_ON_ONCE(!ts->inidle); + +	ts->inidle = 0; + +	if (ts->idle_active || ts->tick_stopped) +		now = ktime_get(); + +	if (ts->idle_active) +		tick_nohz_stop_idle(ts, now); + +	if (ts->tick_stopped) { +		tick_nohz_restart_sched_tick(ts, now); +		tick_nohz_account_idle_ticks(ts); +	}  	local_irq_enable();  } +EXPORT_SYMBOL_GPL(tick_nohz_idle_exit);  static int tick_nohz_reprogram(struct tick_sched *ts, ktime_t now)  { @@ -566,40 +949,12 @@ static void tick_nohz_handler(struct clock_event_device *dev)  {  	struct tick_sched *ts = &__get_cpu_var(tick_cpu_sched);  	struct pt_regs *regs = get_irq_regs(); -	int cpu = smp_processor_id();  	ktime_t now = ktime_get();  	dev->next_event.tv64 = KTIME_MAX; -	/* -	 * Check if the do_timer duty was dropped. We don't care about -	 * concurrency: This happens only when the cpu in charge went -	 * into a long sleep. If two cpus happen to assign themself to -	 * this duty, then the jiffies update is still serialized by -	 * xtime_lock. -	 */ -	if (unlikely(tick_do_timer_cpu == TICK_DO_TIMER_NONE)) -		tick_do_timer_cpu = cpu; - -	/* Check, if the jiffies need an update */ -	if (tick_do_timer_cpu == cpu) -		tick_do_update_jiffies64(now); - -	/* -	 * When we are idle and the tick is stopped, we have to touch -	 * the watchdog as we might not schedule for a really long -	 * time. This happens on complete idle SMP systems while -	 * waiting on the login prompt. We also increment the "start -	 * of idle" jiffy stamp so the idle accounting adjustment we -	 * do when we go busy again does not account too much ticks. -	 */ -	if (ts->tick_stopped) { -		touch_softlockup_watchdog(); -		ts->idle_jiffies++; -	} - -	update_process_times(user_mode(regs)); -	profile_tick(CPU_PROFILING); +	tick_sched_do_timer(now); +	tick_sched_handle(ts, regs);  	while (tick_nohz_reprogram(ts, now)) {  		now = ktime_get(); @@ -623,7 +978,7 @@ static void tick_nohz_switch_to_nohz(void)  		local_irq_enable();  		return;  	} - +	tick_nohz_active = 1;  	ts->nohz_mode = NOHZ_MODE_LOWRES;  	/* @@ -641,9 +996,6 @@ static void tick_nohz_switch_to_nohz(void)  		next = ktime_add(next, tick_period);  	}  	local_irq_enable(); - -	printk(KERN_INFO "Switched to NOHz mode on CPU #%d\n", -	       smp_processor_id());  }  /* @@ -657,12 +1009,10 @@ static void tick_nohz_switch_to_nohz(void)   * timer and do not touch the other magic bits which need to be done   * when idle is left.   
*/ -static void tick_nohz_kick_tick(int cpu, ktime_t now) +static void tick_nohz_kick_tick(struct tick_sched *ts, ktime_t now)  {  #if 0  	/* Switch back to 2.6.27 behaviour */ - -	struct tick_sched *ts = &per_cpu(tick_cpu_sched, cpu);  	ktime_t delta;  	/* @@ -677,36 +1027,36 @@ static void tick_nohz_kick_tick(int cpu, ktime_t now)  #endif  } -static inline void tick_check_nohz(int cpu) +static inline void tick_nohz_irq_enter(void)  { -	struct tick_sched *ts = &per_cpu(tick_cpu_sched, cpu); +	struct tick_sched *ts = &__get_cpu_var(tick_cpu_sched);  	ktime_t now;  	if (!ts->idle_active && !ts->tick_stopped)  		return;  	now = ktime_get();  	if (ts->idle_active) -		tick_nohz_stop_idle(cpu, now); +		tick_nohz_stop_idle(ts, now);  	if (ts->tick_stopped) {  		tick_nohz_update_jiffies(now); -		tick_nohz_kick_tick(cpu, now); +		tick_nohz_kick_tick(ts, now);  	}  }  #else  static inline void tick_nohz_switch_to_nohz(void) { } -static inline void tick_check_nohz(int cpu) { } +static inline void tick_nohz_irq_enter(void) { } -#endif /* NO_HZ */ +#endif /* CONFIG_NO_HZ_COMMON */  /*   * Called from irq_enter to notify about the possible interruption of idle()   */ -void tick_check_idle(int cpu) +void tick_irq_enter(void)  { -	tick_check_oneshot_broadcast(cpu); -	tick_check_nohz(cpu); +	tick_check_oneshot_broadcast_this_cpu(); +	tick_nohz_irq_enter();  }  /* @@ -715,7 +1065,7 @@ void tick_check_idle(int cpu)  #ifdef CONFIG_HIGH_RES_TIMERS  /*   * We rearm the timer until we get disabled by the idle code. - * Called with interrupts disabled and timer->base->cpu_base->lock held. + * Called with interrupts disabled.   */  static enum hrtimer_restart tick_sched_timer(struct hrtimer *timer)  { @@ -723,50 +1073,31 @@ static enum hrtimer_restart tick_sched_timer(struct hrtimer *timer)  		container_of(timer, struct tick_sched, sched_timer);  	struct pt_regs *regs = get_irq_regs();  	ktime_t now = ktime_get(); -	int cpu = smp_processor_id(); -#ifdef CONFIG_NO_HZ -	/* -	 * Check if the do_timer duty was dropped. We don't care about -	 * concurrency: This happens only when the cpu in charge went -	 * into a long sleep. If two cpus happen to assign themself to -	 * this duty, then the jiffies update is still serialized by -	 * xtime_lock. -	 */ -	if (unlikely(tick_do_timer_cpu == TICK_DO_TIMER_NONE)) -		tick_do_timer_cpu = cpu; -#endif - -	/* Check, if the jiffies need an update */ -	if (tick_do_timer_cpu == cpu) -		tick_do_update_jiffies64(now); +	tick_sched_do_timer(now);  	/*  	 * Do not call, when we are not in irq context and have  	 * no valid regs pointer  	 */ -	if (regs) { -		/* -		 * When we are idle and the tick is stopped, we have to touch -		 * the watchdog as we might not schedule for a really long -		 * time. This happens on complete idle SMP systems while -		 * waiting on the login prompt. We also increment the "start of -		 * idle" jiffy stamp so the idle accounting adjustment we do -		 * when we go busy again does not account too much ticks. 
-		 */ -		if (ts->tick_stopped) { -			touch_softlockup_watchdog(); -			ts->idle_jiffies++; -		} -		update_process_times(user_mode(regs)); -		profile_tick(CPU_PROFILING); -	} +	if (regs) +		tick_sched_handle(ts, regs);  	hrtimer_forward(timer, now, tick_period);  	return HRTIMER_RESTART;  } +static int sched_skew_tick; + +static int __init skew_tick(char *str) +{ +	get_option(&str, &sched_skew_tick); + +	return 0; +} +early_param("skew_tick", skew_tick); +  /**   * tick_setup_sched_timer - setup the tick emulation timer   */ @@ -784,6 +1115,14 @@ void tick_setup_sched_timer(void)  	/* Get the next period (per cpu) */  	hrtimer_set_expires(&ts->sched_timer, tick_init_jiffy_update()); +	/* Offset the tick to avert jiffies_lock contention. */ +	if (sched_skew_tick) { +		u64 offset = ktime_to_ns(tick_period) >> 1; +		do_div(offset, num_possible_cpus()); +		offset *= smp_processor_id(); +		hrtimer_add_expires_ns(&ts->sched_timer, offset); +	} +  	for (;;) {  		hrtimer_forward(&ts->sched_timer, now, tick_period);  		hrtimer_start_expires(&ts->sched_timer, @@ -794,14 +1133,16 @@ void tick_setup_sched_timer(void)  		now = ktime_get();  	} -#ifdef CONFIG_NO_HZ -	if (tick_nohz_enabled) +#ifdef CONFIG_NO_HZ_COMMON +	if (tick_nohz_enabled) {  		ts->nohz_mode = NOHZ_MODE_HIGHRES; +		tick_nohz_active = 1; +	}  #endif  }  #endif /* HIGH_RES_TIMERS */ -#if defined CONFIG_NO_HZ || defined CONFIG_HIGH_RES_TIMERS +#if defined CONFIG_NO_HZ_COMMON || defined CONFIG_HIGH_RES_TIMERS  void tick_cancel_sched_timer(int cpu)  {  	struct tick_sched *ts = &per_cpu(tick_cpu_sched, cpu); @@ -811,7 +1152,7 @@ void tick_cancel_sched_timer(int cpu)  		hrtimer_cancel(&ts->sched_timer);  # endif -	ts->nohz_mode = NOHZ_MODE_INACTIVE; +	memset(ts, 0, sizeof(*ts));  }  #endif diff --git a/kernel/time/timecompare.c b/kernel/time/timecompare.c deleted file mode 100644 index ac38fbb176c..00000000000 --- a/kernel/time/timecompare.c +++ /dev/null @@ -1,192 +0,0 @@ -/* - * Copyright (C) 2009 Intel Corporation. - * Author: Patrick Ohly <patrick.ohly@intel.com> - * - * This program is free software; you can redistribute it and/or modify - * it under the terms of the GNU General Public License as published by - * the Free Software Foundation; either version 2 of the License, or - * (at your option) any later version. - * - * This program is distributed in the hope that it will be useful, - * but WITHOUT ANY WARRANTY; without even the implied warranty of - * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the - * GNU General Public License for more details. - * - * You should have received a copy of the GNU General Public License - * along with this program; if not, write to the Free Software - * Foundation, Inc., 675 Mass Ave, Cambridge, MA 02139, USA. - */ - -#include <linux/timecompare.h> -#include <linux/module.h> -#include <linux/slab.h> -#include <linux/math64.h> - -/* - * fixed point arithmetic scale factor for skew - * - * Usually one would measure skew in ppb (parts per billion, 1e9), but - * using a factor of 2 simplifies the math. 
- */ -#define TIMECOMPARE_SKEW_RESOLUTION (((s64)1)<<30) - -ktime_t timecompare_transform(struct timecompare *sync, -			      u64 source_tstamp) -{ -	u64 nsec; - -	nsec = source_tstamp + sync->offset; -	nsec += (s64)(source_tstamp - sync->last_update) * sync->skew / -		TIMECOMPARE_SKEW_RESOLUTION; - -	return ns_to_ktime(nsec); -} -EXPORT_SYMBOL_GPL(timecompare_transform); - -int timecompare_offset(struct timecompare *sync, -		       s64 *offset, -		       u64 *source_tstamp) -{ -	u64 start_source = 0, end_source = 0; -	struct { -		s64 offset; -		s64 duration_target; -	} buffer[10], sample, *samples; -	int counter = 0, i; -	int used; -	int index; -	int num_samples = sync->num_samples; - -	if (num_samples > sizeof(buffer)/sizeof(buffer[0])) { -		samples = kmalloc(sizeof(*samples) * num_samples, GFP_ATOMIC); -		if (!samples) { -			samples = buffer; -			num_samples = sizeof(buffer)/sizeof(buffer[0]); -		} -	} else { -		samples = buffer; -	} - -	/* run until we have enough valid samples, but do not try forever */ -	i = 0; -	counter = 0; -	while (1) { -		u64 ts; -		ktime_t start, end; - -		start = sync->target(); -		ts = timecounter_read(sync->source); -		end = sync->target(); - -		if (!i) -			start_source = ts; - -		/* ignore negative durations */ -		sample.duration_target = ktime_to_ns(ktime_sub(end, start)); -		if (sample.duration_target >= 0) { -			/* -			 * assume symetric delay to and from source: -			 * average target time corresponds to measured -			 * source time -			 */ -			sample.offset = -				(ktime_to_ns(end) + ktime_to_ns(start)) / 2 - -				ts; - -			/* simple insertion sort based on duration */ -			index = counter - 1; -			while (index >= 0) { -				if (samples[index].duration_target < -				    sample.duration_target) -					break; -				samples[index + 1] = samples[index]; -				index--; -			} -			samples[index + 1] = sample; -			counter++; -		} - -		i++; -		if (counter >= num_samples || i >= 100000) { -			end_source = ts; -			break; -		} -	} - -	*source_tstamp = (end_source + start_source) / 2; - -	/* remove outliers by only using 75% of the samples */ -	used = counter * 3 / 4; -	if (!used) -		used = counter; -	if (used) { -		/* calculate average */ -		s64 off = 0; -		for (index = 0; index < used; index++) -			off += samples[index].offset; -		*offset = div_s64(off, used); -	} - -	if (samples && samples != buffer) -		kfree(samples); - -	return used; -} -EXPORT_SYMBOL_GPL(timecompare_offset); - -void __timecompare_update(struct timecompare *sync, -			  u64 source_tstamp) -{ -	s64 offset; -	u64 average_time; - -	if (!timecompare_offset(sync, &offset, &average_time)) -		return; - -	if (!sync->last_update) { -		sync->last_update = average_time; -		sync->offset = offset; -		sync->skew = 0; -	} else { -		s64 delta_nsec = average_time - sync->last_update; - -		/* avoid division by negative or small deltas */ -		if (delta_nsec >= 10000) { -			s64 delta_offset_nsec = offset - sync->offset; -			s64 skew; /* delta_offset_nsec * -				     TIMECOMPARE_SKEW_RESOLUTION / -				     delta_nsec */ -			u64 divisor; - -			/* div_s64() is limited to 32 bit divisor */ -			skew = delta_offset_nsec * TIMECOMPARE_SKEW_RESOLUTION; -			divisor = delta_nsec; -			while (unlikely(divisor >= ((s64)1) << 32)) { -				/* divide both by 2; beware, right shift -				   of negative value has undefined -				   behavior and can only be used for -				   the positive divisor */ -				skew = div_s64(skew, 2); -				divisor >>= 1; -			} -			skew = div_s64(skew, divisor); - -			/* -			 * Calculate new overall skew as 4/16 the -			
 * old value and 12/16 the new one. This is -			 * a rather arbitrary tradeoff between -			 * only using the latest measurement (0/16 and -			 * 16/16) and even more weight on past measurements. -			 */ -#define TIMECOMPARE_NEW_SKEW_PER_16 12 -			sync->skew = -				div_s64((16 - TIMECOMPARE_NEW_SKEW_PER_16) * -					sync->skew + -					TIMECOMPARE_NEW_SKEW_PER_16 * skew, -					16); -			sync->last_update = average_time; -			sync->offset = offset; -		} -	} -} -EXPORT_SYMBOL_GPL(__timecompare_update); diff --git a/kernel/time/timekeeping.c b/kernel/time/timekeeping.c index 49010d822f7..32d8d6aaedb 100644 --- a/kernel/time/timekeeping.c +++ b/kernel/time/timekeeping.c @@ -8,50 +8,92 @@   *   */ +#include <linux/timekeeper_internal.h>  #include <linux/module.h>  #include <linux/interrupt.h>  #include <linux/percpu.h>  #include <linux/init.h>  #include <linux/mm.h>  #include <linux/sched.h> -#include <linux/sysdev.h> +#include <linux/syscore_ops.h>  #include <linux/clocksource.h>  #include <linux/jiffies.h>  #include <linux/time.h>  #include <linux/tick.h>  #include <linux/stop_machine.h> +#include <linux/pvclock_gtod.h> +#include <linux/compiler.h> -/* Structure holding internal timekeeping values. */ -struct timekeeper { -	/* Current clocksource used for timekeeping. */ -	struct clocksource *clock; -	/* The shift value of the current clocksource. */ -	int	shift; - -	/* Number of clock cycles in one NTP interval. */ -	cycle_t cycle_interval; -	/* Number of clock shifted nano seconds in one NTP interval. */ -	u64	xtime_interval; -	/* Raw nano seconds accumulated per NTP interval. */ -	u32	raw_interval; - -	/* Clock shifted nano seconds remainder not stored in xtime.tv_nsec. */ -	u64	xtime_nsec; -	/* Difference between accumulated time and NTP time in ntp -	 * shifted nano seconds. */ -	s64	ntp_error; -	/* Shift conversion between clock shifted nano seconds and -	 * ntp shifted nano seconds. 
*/ -	int	ntp_error_shift; -	/* NTP adjusted clock multiplier */ -	u32	mult; -}; +#include "tick-internal.h" +#include "ntp_internal.h" +#include "timekeeping_internal.h" + +#define TK_CLEAR_NTP		(1 << 0) +#define TK_MIRROR		(1 << 1) +#define TK_CLOCK_WAS_SET	(1 << 2) + +static struct timekeeper timekeeper; +static DEFINE_RAW_SPINLOCK(timekeeper_lock); +static seqcount_t timekeeper_seq; +static struct timekeeper shadow_timekeeper; + +/* flag for if timekeeping is suspended */ +int __read_mostly timekeeping_suspended; -struct timekeeper timekeeper; +/* Flag for if there is a persistent clock on this platform */ +bool __read_mostly persistent_clock_exist = false; + +static inline void tk_normalize_xtime(struct timekeeper *tk) +{ +	while (tk->xtime_nsec >= ((u64)NSEC_PER_SEC << tk->shift)) { +		tk->xtime_nsec -= (u64)NSEC_PER_SEC << tk->shift; +		tk->xtime_sec++; +	} +} + +static void tk_set_xtime(struct timekeeper *tk, const struct timespec *ts) +{ +	tk->xtime_sec = ts->tv_sec; +	tk->xtime_nsec = (u64)ts->tv_nsec << tk->shift; +} + +static void tk_xtime_add(struct timekeeper *tk, const struct timespec *ts) +{ +	tk->xtime_sec += ts->tv_sec; +	tk->xtime_nsec += (u64)ts->tv_nsec << tk->shift; +	tk_normalize_xtime(tk); +} + +static void tk_set_wall_to_mono(struct timekeeper *tk, struct timespec wtm) +{ +	struct timespec tmp; + +	/* +	 * Verify consistency of: offset_real = -wall_to_monotonic +	 * before modifying anything +	 */ +	set_normalized_timespec(&tmp, -tk->wall_to_monotonic.tv_sec, +					-tk->wall_to_monotonic.tv_nsec); +	WARN_ON_ONCE(tk->offs_real.tv64 != timespec_to_ktime(tmp).tv64); +	tk->wall_to_monotonic = wtm; +	set_normalized_timespec(&tmp, -wtm.tv_sec, -wtm.tv_nsec); +	tk->offs_real = timespec_to_ktime(tmp); +	tk->offs_tai = ktime_add(tk->offs_real, ktime_set(tk->tai_offset, 0)); +} + +static void tk_set_sleep_time(struct timekeeper *tk, struct timespec t) +{ +	/* Verify consistency before modifying */ +	WARN_ON_ONCE(tk->offs_boot.tv64 != timespec_to_ktime(tk->total_sleep_time).tv64); + +	tk->total_sleep_time	= t; +	tk->offs_boot		= timespec_to_ktime(t); +}  /** - * timekeeper_setup_internals - Set up internals to use clocksource clock. + * tk_setup_internals - Set up internals to use clocksource clock.   * + * @tk:		The target timekeeper to setup.   * @clock:		Pointer to clocksource.   *   * Calculates a fixed cycle/nsec interval for a given clocksource/adjustment @@ -59,119 +101,164 @@ struct timekeeper timekeeper;   *   * Unless you're the timekeeping code, you should not be using this!   
*/ -static void timekeeper_setup_internals(struct clocksource *clock) +static void tk_setup_internals(struct timekeeper *tk, struct clocksource *clock)  {  	cycle_t interval; -	u64 tmp; +	u64 tmp, ntpinterval; +	struct clocksource *old_clock; -	timekeeper.clock = clock; -	clock->cycle_last = clock->read(clock); +	old_clock = tk->clock; +	tk->clock = clock; +	tk->cycle_last = clock->cycle_last = clock->read(clock);  	/* Do the ns -> cycle conversion first, using original mult */  	tmp = NTP_INTERVAL_LENGTH;  	tmp <<= clock->shift; +	ntpinterval = tmp;  	tmp += clock->mult/2;  	do_div(tmp, clock->mult);  	if (tmp == 0)  		tmp = 1;  	interval = (cycle_t) tmp; -	timekeeper.cycle_interval = interval; +	tk->cycle_interval = interval;  	/* Go back from cycles -> shifted ns */ -	timekeeper.xtime_interval = (u64) interval * clock->mult; -	timekeeper.raw_interval = +	tk->xtime_interval = (u64) interval * clock->mult; +	tk->xtime_remainder = ntpinterval - tk->xtime_interval; +	tk->raw_interval =  		((u64) interval * clock->mult) >> clock->shift; -	timekeeper.xtime_nsec = 0; -	timekeeper.shift = clock->shift; +	 /* if changing clocks, convert xtime_nsec shift units */ +	if (old_clock) { +		int shift_change = clock->shift - old_clock->shift; +		if (shift_change < 0) +			tk->xtime_nsec >>= -shift_change; +		else +			tk->xtime_nsec <<= shift_change; +	} +	tk->shift = clock->shift; -	timekeeper.ntp_error = 0; -	timekeeper.ntp_error_shift = NTP_SCALE_SHIFT - clock->shift; +	tk->ntp_error = 0; +	tk->ntp_error_shift = NTP_SCALE_SHIFT - clock->shift;  	/*  	 * The timekeeper keeps its own mult values for the currently  	 * active clocksource. These value will be adjusted via NTP  	 * to counteract clock drifting.  	 */ -	timekeeper.mult = clock->mult; +	tk->mult = clock->mult;  }  /* Timekeeper helper functions. */ -static inline s64 timekeeping_get_ns(void) + +#ifdef CONFIG_ARCH_USES_GETTIMEOFFSET +u32 (*arch_gettimeoffset)(void); + +u32 get_arch_timeoffset(void) +{ +	if (likely(arch_gettimeoffset)) +		return arch_gettimeoffset(); +	return 0; +} +#else +static inline u32 get_arch_timeoffset(void) { return 0; } +#endif + +static inline s64 timekeeping_get_ns(struct timekeeper *tk)  {  	cycle_t cycle_now, cycle_delta;  	struct clocksource *clock; +	s64 nsec;  	/* read clocksource: */ -	clock = timekeeper.clock; +	clock = tk->clock;  	cycle_now = clock->read(clock);  	/* calculate the delta since the last update_wall_time: */  	cycle_delta = (cycle_now - clock->cycle_last) & clock->mask; -	/* return delta convert to nanoseconds using ntp adjusted mult. */ -	return clocksource_cyc2ns(cycle_delta, timekeeper.mult, -				  timekeeper.shift); +	nsec = cycle_delta * tk->mult + tk->xtime_nsec; +	nsec >>= tk->shift; + +	/* If arch requires, add in get_arch_timeoffset() */ +	return nsec + get_arch_timeoffset();  } -static inline s64 timekeeping_get_ns_raw(void) +static inline s64 timekeeping_get_ns_raw(struct timekeeper *tk)  {  	cycle_t cycle_now, cycle_delta;  	struct clocksource *clock; +	s64 nsec;  	/* read clocksource: */ -	clock = timekeeper.clock; +	clock = tk->clock;  	cycle_now = clock->read(clock);  	/* calculate the delta since the last update_wall_time: */  	cycle_delta = (cycle_now - clock->cycle_last) & clock->mask; -	/* return delta convert to nanoseconds using ntp adjusted mult. */ -	return clocksource_cyc2ns(cycle_delta, clock->mult, clock->shift); +	/* convert delta to nanoseconds. 
*/ +	nsec = clocksource_cyc2ns(cycle_delta, clock->mult, clock->shift); + +	/* If arch requires, add in get_arch_timeoffset() */ +	return nsec + get_arch_timeoffset();  } -/* - * This read-write spinlock protects us from races in SMP while - * playing with xtime. - */ -__cacheline_aligned_in_smp DEFINE_SEQLOCK(xtime_lock); +static RAW_NOTIFIER_HEAD(pvclock_gtod_chain); +static void update_pvclock_gtod(struct timekeeper *tk, bool was_set) +{ +	raw_notifier_call_chain(&pvclock_gtod_chain, was_set, tk); +} -/* - * The current time - * wall_to_monotonic is what we need to add to xtime (or xtime corrected - * for sub jiffie times) to get to monotonic time.  Monotonic is pegged - * at zero at system boot time, so wall_to_monotonic will be negative, - * however, we will ALWAYS keep the tv_nsec part positive so we can use - * the usual normalization. - * - * wall_to_monotonic is moved after resume from suspend for the monotonic - * time not to jump. We need to add total_sleep_time to wall_to_monotonic - * to get the real boot based time offset. - * - * - wall_to_monotonic is no longer the boot time, getboottime must be - * used instead. +/** + * pvclock_gtod_register_notifier - register a pvclock timedata update listener   */ -static struct timespec xtime __attribute__ ((aligned (16))); -static struct timespec wall_to_monotonic __attribute__ ((aligned (16))); -static struct timespec total_sleep_time; +int pvclock_gtod_register_notifier(struct notifier_block *nb) +{ +	struct timekeeper *tk = &timekeeper; +	unsigned long flags; +	int ret; -/* - * The raw monotonic time for the CLOCK_MONOTONIC_RAW posix clock. +	raw_spin_lock_irqsave(&timekeeper_lock, flags); +	ret = raw_notifier_chain_register(&pvclock_gtod_chain, nb); +	update_pvclock_gtod(tk, true); +	raw_spin_unlock_irqrestore(&timekeeper_lock, flags); + +	return ret; +} +EXPORT_SYMBOL_GPL(pvclock_gtod_register_notifier); + +/** + * pvclock_gtod_unregister_notifier - unregister a pvclock + * timedata update listener   */ -struct timespec raw_time; +int pvclock_gtod_unregister_notifier(struct notifier_block *nb) +{ +	unsigned long flags; +	int ret; -/* flag for if timekeeping is suspended */ -int __read_mostly timekeeping_suspended; +	raw_spin_lock_irqsave(&timekeeper_lock, flags); +	ret = raw_notifier_chain_unregister(&pvclock_gtod_chain, nb); +	raw_spin_unlock_irqrestore(&timekeeper_lock, flags); + +	return ret; +} +EXPORT_SYMBOL_GPL(pvclock_gtod_unregister_notifier); -/* must hold xtime_lock */ -void timekeeping_leap_insert(int leapsecond) +/* must hold timekeeper_lock */ +static void timekeeping_update(struct timekeeper *tk, unsigned int action)  { -	xtime.tv_sec += leapsecond; -	wall_to_monotonic.tv_sec -= leapsecond; -	update_vsyscall(&xtime, &wall_to_monotonic, timekeeper.clock, -			timekeeper.mult); +	if (action & TK_CLEAR_NTP) { +		tk->ntp_error = 0; +		ntp_clear(); +	} +	update_vsyscall(tk); +	update_pvclock_gtod(tk, action & TK_CLOCK_WAS_SET); + +	if (action & TK_MIRROR) +		memcpy(&shadow_timekeeper, &timekeeper, sizeof(timekeeper));  }  /** @@ -181,72 +268,88 @@ void timekeeping_leap_insert(int leapsecond)   * update_wall_time(). This is useful before significant clock changes,   * as it avoids having to deal with this time offset explicitly.   
*/ -static void timekeeping_forward_now(void) +static void timekeeping_forward_now(struct timekeeper *tk)  {  	cycle_t cycle_now, cycle_delta;  	struct clocksource *clock;  	s64 nsec; -	clock = timekeeper.clock; +	clock = tk->clock;  	cycle_now = clock->read(clock);  	cycle_delta = (cycle_now - clock->cycle_last) & clock->mask; -	clock->cycle_last = cycle_now; +	tk->cycle_last = clock->cycle_last = cycle_now; -	nsec = clocksource_cyc2ns(cycle_delta, timekeeper.mult, -				  timekeeper.shift); +	tk->xtime_nsec += cycle_delta * tk->mult; -	/* If arch requires, add in gettimeoffset() */ -	nsec += arch_gettimeoffset(); +	/* If arch requires, add in get_arch_timeoffset() */ +	tk->xtime_nsec += (u64)get_arch_timeoffset() << tk->shift; -	timespec_add_ns(&xtime, nsec); +	tk_normalize_xtime(tk);  	nsec = clocksource_cyc2ns(cycle_delta, clock->mult, clock->shift); -	timespec_add_ns(&raw_time, nsec); +	timespec_add_ns(&tk->raw_time, nsec);  }  /** - * getnstimeofday - Returns the time of day in a timespec + * __getnstimeofday - Returns the time of day in a timespec.   * @ts:		pointer to the timespec to be set   * - * Returns the time of day in a timespec. + * Updates the time of day in the timespec. + * Returns 0 on success, or -ve when suspended (timespec will be undefined).   */ -void getnstimeofday(struct timespec *ts) +int __getnstimeofday(struct timespec *ts)  { +	struct timekeeper *tk = &timekeeper;  	unsigned long seq; -	s64 nsecs; - -	WARN_ON(timekeeping_suspended); +	s64 nsecs = 0;  	do { -		seq = read_seqbegin(&xtime_lock); +		seq = read_seqcount_begin(&timekeeper_seq); -		*ts = xtime; -		nsecs = timekeeping_get_ns(); +		ts->tv_sec = tk->xtime_sec; +		nsecs = timekeeping_get_ns(tk); -		/* If arch requires, add in gettimeoffset() */ -		nsecs += arch_gettimeoffset(); - -	} while (read_seqretry(&xtime_lock, seq)); +	} while (read_seqcount_retry(&timekeeper_seq, seq)); +	ts->tv_nsec = 0;  	timespec_add_ns(ts, nsecs); + +	/* +	 * Do not bail out early, in case there were callers still using +	 * the value, even in the face of the WARN_ON. +	 */ +	if (unlikely(timekeeping_suspended)) +		return -EAGAIN; +	return 0;  } +EXPORT_SYMBOL(__getnstimeofday); +/** + * getnstimeofday - Returns the time of day in a timespec. + * @ts:		pointer to the timespec to be set + * + * Returns the time of day in a timespec (WARN if suspended). + */ +void getnstimeofday(struct timespec *ts) +{ +	WARN_ON(__getnstimeofday(ts)); +}  EXPORT_SYMBOL(getnstimeofday);  ktime_t ktime_get(void)  { +	struct timekeeper *tk = &timekeeper;  	unsigned int seq;  	s64 secs, nsecs;  	WARN_ON(timekeeping_suspended);  	do { -		seq = read_seqbegin(&xtime_lock); -		secs = xtime.tv_sec + wall_to_monotonic.tv_sec; -		nsecs = xtime.tv_nsec + wall_to_monotonic.tv_nsec; -		nsecs += timekeeping_get_ns(); +		seq = read_seqcount_begin(&timekeeper_seq); +		secs = tk->xtime_sec + tk->wall_to_monotonic.tv_sec; +		nsecs = timekeeping_get_ns(tk) + tk->wall_to_monotonic.tv_nsec; -	} while (read_seqretry(&xtime_lock, seq)); +	} while (read_seqcount_retry(&timekeeper_seq, seq));  	/*  	 * Use ktime_set/ktime_add_ns to create a proper ktime on  	 * 32-bit architectures without CONFIG_KTIME_SCALAR. 
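/*
 * [Editor's note -- illustrative sketch, not part of this patch.]
 * ktime_get()/ktime_get_ts() above now follow the lockless-reader
 * discipline used throughout the new timekeeping code: writers hold
 * timekeeper_lock and bracket the update with timekeeper_seq, while
 * readers retry on the sequence counter instead of taking a lock.
 * The minimal form of that pattern, with made-up sample_* names,
 * looks like this; only the primitives from <linux/seqlock.h> and
 * <linux/spinlock.h> are real.
 */
#include <linux/seqlock.h>
#include <linux/spinlock.h>
#include <linux/types.h>

static DEFINE_RAW_SPINLOCK(sample_lock);
static seqcount_t sample_seq;
static struct { u64 sec, nsec; } sample_time;

static void sample_write(u64 sec, u64 nsec)
{
	unsigned long flags;

	raw_spin_lock_irqsave(&sample_lock, flags);	/* serialize writers */
	write_seqcount_begin(&sample_seq);		/* readers will retry */
	sample_time.sec  = sec;
	sample_time.nsec = nsec;
	write_seqcount_end(&sample_seq);
	raw_spin_unlock_irqrestore(&sample_lock, flags);
}

static void sample_read(u64 *sec, u64 *nsec)
{
	unsigned int seq;

	do {
		seq = read_seqcount_begin(&sample_seq);
		*sec  = sample_time.sec;
		*nsec = sample_time.nsec;
	} while (read_seqcount_retry(&sample_seq, seq));
}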
@@ -265,25 +368,109 @@ EXPORT_SYMBOL_GPL(ktime_get);   */  void ktime_get_ts(struct timespec *ts)  { +	struct timekeeper *tk = &timekeeper;  	struct timespec tomono; +	s64 nsec;  	unsigned int seq; -	s64 nsecs;  	WARN_ON(timekeeping_suspended);  	do { -		seq = read_seqbegin(&xtime_lock); -		*ts = xtime; -		tomono = wall_to_monotonic; -		nsecs = timekeeping_get_ns(); +		seq = read_seqcount_begin(&timekeeper_seq); +		ts->tv_sec = tk->xtime_sec; +		nsec = timekeeping_get_ns(tk); +		tomono = tk->wall_to_monotonic; -	} while (read_seqretry(&xtime_lock, seq)); +	} while (read_seqcount_retry(&timekeeper_seq, seq)); -	set_normalized_timespec(ts, ts->tv_sec + tomono.tv_sec, -				ts->tv_nsec + tomono.tv_nsec + nsecs); +	ts->tv_sec += tomono.tv_sec; +	ts->tv_nsec = 0; +	timespec_add_ns(ts, nsec + tomono.tv_nsec);  }  EXPORT_SYMBOL_GPL(ktime_get_ts); + +/** + * timekeeping_clocktai - Returns the TAI time of day in a timespec + * @ts:		pointer to the timespec to be set + * + * Returns the time of day in a timespec. + */ +void timekeeping_clocktai(struct timespec *ts) +{ +	struct timekeeper *tk = &timekeeper; +	unsigned long seq; +	u64 nsecs; + +	WARN_ON(timekeeping_suspended); + +	do { +		seq = read_seqcount_begin(&timekeeper_seq); + +		ts->tv_sec = tk->xtime_sec + tk->tai_offset; +		nsecs = timekeeping_get_ns(tk); + +	} while (read_seqcount_retry(&timekeeper_seq, seq)); + +	ts->tv_nsec = 0; +	timespec_add_ns(ts, nsecs); + +} +EXPORT_SYMBOL(timekeeping_clocktai); + + +/** + * ktime_get_clocktai - Returns the TAI time of day in a ktime + * + * Returns the time of day in a ktime. + */ +ktime_t ktime_get_clocktai(void) +{ +	struct timespec ts; + +	timekeeping_clocktai(&ts); +	return timespec_to_ktime(ts); +} +EXPORT_SYMBOL(ktime_get_clocktai); + +#ifdef CONFIG_NTP_PPS + +/** + * getnstime_raw_and_real - get day and raw monotonic time in timespec format + * @ts_raw:	pointer to the timespec to be set to raw monotonic time + * @ts_real:	pointer to the timespec to be set to the time of day + * + * This function reads both the time of day and raw monotonic time at the + * same time atomically and stores the resulting timestamps in timespec + * format. 
+ */ +void getnstime_raw_and_real(struct timespec *ts_raw, struct timespec *ts_real) +{ +	struct timekeeper *tk = &timekeeper; +	unsigned long seq; +	s64 nsecs_raw, nsecs_real; + +	WARN_ON_ONCE(timekeeping_suspended); + +	do { +		seq = read_seqcount_begin(&timekeeper_seq); + +		*ts_raw = tk->raw_time; +		ts_real->tv_sec = tk->xtime_sec; +		ts_real->tv_nsec = 0; + +		nsecs_raw = timekeeping_get_ns_raw(tk); +		nsecs_real = timekeeping_get_ns(tk); + +	} while (read_seqcount_retry(&timekeeper_seq, seq)); + +	timespec_add_ns(ts_raw, nsecs_raw); +	timespec_add_ns(ts_real, nsecs_real); +} +EXPORT_SYMBOL(getnstime_raw_and_real); + +#endif /* CONFIG_NTP_PPS */ +  /**   * do_gettimeofday - Returns the time of day in a timeval   * @tv:		pointer to the timeval to be set @@ -298,66 +485,176 @@ void do_gettimeofday(struct timeval *tv)  	tv->tv_sec = now.tv_sec;  	tv->tv_usec = now.tv_nsec/1000;  } -  EXPORT_SYMBOL(do_gettimeofday); +  /**   * do_settimeofday - Sets the time of day   * @tv:		pointer to the timespec variable containing the new time   *   * Sets the time of day to the new time and update NTP and notify hrtimers   */ -int do_settimeofday(struct timespec *tv) +int do_settimeofday(const struct timespec *tv)  { -	struct timespec ts_delta; +	struct timekeeper *tk = &timekeeper; +	struct timespec ts_delta, xt;  	unsigned long flags; -	if ((unsigned long)tv->tv_nsec >= NSEC_PER_SEC) +	if (!timespec_valid_strict(tv))  		return -EINVAL; -	write_seqlock_irqsave(&xtime_lock, flags); +	raw_spin_lock_irqsave(&timekeeper_lock, flags); +	write_seqcount_begin(&timekeeper_seq); -	timekeeping_forward_now(); +	timekeeping_forward_now(tk); -	ts_delta.tv_sec = tv->tv_sec - xtime.tv_sec; -	ts_delta.tv_nsec = tv->tv_nsec - xtime.tv_nsec; -	wall_to_monotonic = timespec_sub(wall_to_monotonic, ts_delta); +	xt = tk_xtime(tk); +	ts_delta.tv_sec = tv->tv_sec - xt.tv_sec; +	ts_delta.tv_nsec = tv->tv_nsec - xt.tv_nsec; -	xtime = *tv; +	tk_set_wall_to_mono(tk, timespec_sub(tk->wall_to_monotonic, ts_delta)); -	timekeeper.ntp_error = 0; -	ntp_clear(); +	tk_set_xtime(tk, tv); -	update_vsyscall(&xtime, &wall_to_monotonic, timekeeper.clock, -				timekeeper.mult); +	timekeeping_update(tk, TK_CLEAR_NTP | TK_MIRROR | TK_CLOCK_WAS_SET); -	write_sequnlock_irqrestore(&xtime_lock, flags); +	write_seqcount_end(&timekeeper_seq); +	raw_spin_unlock_irqrestore(&timekeeper_lock, flags);  	/* signal hrtimers about time change */  	clock_was_set();  	return 0;  } -  EXPORT_SYMBOL(do_settimeofday);  /** + * timekeeping_inject_offset - Adds or subtracts from the current time. + * @tv:		pointer to the timespec variable containing the offset + * + * Adds or subtracts an offset value from the current time. 
+ */ +int timekeeping_inject_offset(struct timespec *ts) +{ +	struct timekeeper *tk = &timekeeper; +	unsigned long flags; +	struct timespec tmp; +	int ret = 0; + +	if ((unsigned long)ts->tv_nsec >= NSEC_PER_SEC) +		return -EINVAL; + +	raw_spin_lock_irqsave(&timekeeper_lock, flags); +	write_seqcount_begin(&timekeeper_seq); + +	timekeeping_forward_now(tk); + +	/* Make sure the proposed value is valid */ +	tmp = timespec_add(tk_xtime(tk),  *ts); +	if (!timespec_valid_strict(&tmp)) { +		ret = -EINVAL; +		goto error; +	} + +	tk_xtime_add(tk, ts); +	tk_set_wall_to_mono(tk, timespec_sub(tk->wall_to_monotonic, *ts)); + +error: /* even if we error out, we forwarded the time, so call update */ +	timekeeping_update(tk, TK_CLEAR_NTP | TK_MIRROR | TK_CLOCK_WAS_SET); + +	write_seqcount_end(&timekeeper_seq); +	raw_spin_unlock_irqrestore(&timekeeper_lock, flags); + +	/* signal hrtimers about time change */ +	clock_was_set(); + +	return ret; +} +EXPORT_SYMBOL(timekeeping_inject_offset); + + +/** + * timekeeping_get_tai_offset - Returns current TAI offset from UTC + * + */ +s32 timekeeping_get_tai_offset(void) +{ +	struct timekeeper *tk = &timekeeper; +	unsigned int seq; +	s32 ret; + +	do { +		seq = read_seqcount_begin(&timekeeper_seq); +		ret = tk->tai_offset; +	} while (read_seqcount_retry(&timekeeper_seq, seq)); + +	return ret; +} + +/** + * __timekeeping_set_tai_offset - Lock free worker function + * + */ +static void __timekeeping_set_tai_offset(struct timekeeper *tk, s32 tai_offset) +{ +	tk->tai_offset = tai_offset; +	tk->offs_tai = ktime_add(tk->offs_real, ktime_set(tai_offset, 0)); +} + +/** + * timekeeping_set_tai_offset - Sets the current TAI offset from UTC + * + */ +void timekeeping_set_tai_offset(s32 tai_offset) +{ +	struct timekeeper *tk = &timekeeper; +	unsigned long flags; + +	raw_spin_lock_irqsave(&timekeeper_lock, flags); +	write_seqcount_begin(&timekeeper_seq); +	__timekeeping_set_tai_offset(tk, tai_offset); +	timekeeping_update(tk, TK_MIRROR | TK_CLOCK_WAS_SET); +	write_seqcount_end(&timekeeper_seq); +	raw_spin_unlock_irqrestore(&timekeeper_lock, flags); +	clock_was_set(); +} + +/**   * change_clocksource - Swaps clocksources if a new one is available   *   * Accumulates current time interval and initializes new clocksource   */  static int change_clocksource(void *data)  { +	struct timekeeper *tk = &timekeeper;  	struct clocksource *new, *old; +	unsigned long flags;  	new = (struct clocksource *) data; -	timekeeping_forward_now(); -	if (!new->enable || new->enable(new) == 0) { -		old = timekeeper.clock; -		timekeeper_setup_internals(new); -		if (old->disable) -			old->disable(old); +	raw_spin_lock_irqsave(&timekeeper_lock, flags); +	write_seqcount_begin(&timekeeper_seq); + +	timekeeping_forward_now(tk); +	/* +	 * If the cs is in module, get a module reference. Succeeds +	 * for built-in code (owner == NULL) as well. +	 */ +	if (try_module_get(new->owner)) { +		if (!new->enable || new->enable(new) == 0) { +			old = tk->clock; +			tk_setup_internals(tk, new); +			if (old->disable) +				old->disable(old); +			module_put(old->owner); +		} else { +			module_put(new->owner); +		}  	} +	timekeeping_update(tk, TK_CLEAR_NTP | TK_MIRROR | TK_CLOCK_WAS_SET); + +	write_seqcount_end(&timekeeper_seq); +	raw_spin_unlock_irqrestore(&timekeeper_lock, flags); +  	return 0;  } @@ -368,12 +665,15 @@ static int change_clocksource(void *data)   * This function is called from clocksource.c after a new, better clock   * source has been registered. The caller holds the clocksource_mutex.   
*/ -void timekeeping_notify(struct clocksource *clock) +int timekeeping_notify(struct clocksource *clock)  { -	if (timekeeper.clock == clock) -		return; +	struct timekeeper *tk = &timekeeper; + +	if (tk->clock == clock) +		return 0;  	stop_machine(change_clocksource, clock, NULL);  	tick_clock_notify(); +	return tk->clock == clock ? 0 : -1;  }  /** @@ -399,48 +699,57 @@ EXPORT_SYMBOL_GPL(ktime_get_real);   */  void getrawmonotonic(struct timespec *ts)  { +	struct timekeeper *tk = &timekeeper;  	unsigned long seq;  	s64 nsecs;  	do { -		seq = read_seqbegin(&xtime_lock); -		nsecs = timekeeping_get_ns_raw(); -		*ts = raw_time; +		seq = read_seqcount_begin(&timekeeper_seq); +		nsecs = timekeeping_get_ns_raw(tk); +		*ts = tk->raw_time; -	} while (read_seqretry(&xtime_lock, seq)); +	} while (read_seqcount_retry(&timekeeper_seq, seq));  	timespec_add_ns(ts, nsecs);  }  EXPORT_SYMBOL(getrawmonotonic); -  /**   * timekeeping_valid_for_hres - Check if timekeeping is suitable for hres   */  int timekeeping_valid_for_hres(void)  { +	struct timekeeper *tk = &timekeeper;  	unsigned long seq;  	int ret;  	do { -		seq = read_seqbegin(&xtime_lock); +		seq = read_seqcount_begin(&timekeeper_seq); -		ret = timekeeper.clock->flags & CLOCK_SOURCE_VALID_FOR_HRES; +		ret = tk->clock->flags & CLOCK_SOURCE_VALID_FOR_HRES; -	} while (read_seqretry(&xtime_lock, seq)); +	} while (read_seqcount_retry(&timekeeper_seq, seq));  	return ret;  }  /**   * timekeeping_max_deferment - Returns max time the clocksource can be deferred - * - * Caller must observe xtime_lock via read_seqbegin/read_seqretry to - * ensure that the clocksource does not change!   */  u64 timekeeping_max_deferment(void)  { -	return timekeeper.clock->max_idle_ns; +	struct timekeeper *tk = &timekeeper; +	unsigned long seq; +	u64 ret; + +	do { +		seq = read_seqcount_begin(&timekeeper_seq); + +		ret = tk->clock->max_idle_ns; + +	} while (read_seqcount_retry(&timekeeper_seq, seq)); + +	return ret;  }  /** @@ -452,7 +761,7 @@ u64 timekeeping_max_deferment(void)   *   *  XXX - Do be sure to remove it once all arches implement it.   */ -void __attribute__((weak)) read_persistent_clock(struct timespec *ts) +void __weak read_persistent_clock(struct timespec *ts)  {  	ts->tv_sec = 0;  	ts->tv_nsec = 0; @@ -467,7 +776,7 @@ void __attribute__((weak)) read_persistent_clock(struct timespec *ts)   *   *  XXX - Do be sure to remove it once all arches implement it.   
*/ -void __attribute__((weak)) read_boot_clock(struct timespec *ts) +void __weak read_boot_clock(struct timespec *ts)  {  	ts->tv_sec = 0;  	ts->tv_nsec = 0; @@ -478,125 +787,278 @@ void __attribute__((weak)) read_boot_clock(struct timespec *ts)   */  void __init timekeeping_init(void)  { +	struct timekeeper *tk = &timekeeper;  	struct clocksource *clock;  	unsigned long flags; -	struct timespec now, boot; +	struct timespec now, boot, tmp;  	read_persistent_clock(&now); -	read_boot_clock(&boot); -	write_seqlock_irqsave(&xtime_lock, flags); +	if (!timespec_valid_strict(&now)) { +		pr_warn("WARNING: Persistent clock returned invalid value!\n" +			"         Check your CMOS/BIOS settings.\n"); +		now.tv_sec = 0; +		now.tv_nsec = 0; +	} else if (now.tv_sec || now.tv_nsec) +		persistent_clock_exist = true; +	read_boot_clock(&boot); +	if (!timespec_valid_strict(&boot)) { +		pr_warn("WARNING: Boot clock returned invalid value!\n" +			"         Check your CMOS/BIOS settings.\n"); +		boot.tv_sec = 0; +		boot.tv_nsec = 0; +	} + +	raw_spin_lock_irqsave(&timekeeper_lock, flags); +	write_seqcount_begin(&timekeeper_seq);  	ntp_init();  	clock = clocksource_default_clock();  	if (clock->enable)  		clock->enable(clock); -	timekeeper_setup_internals(clock); - -	xtime.tv_sec = now.tv_sec; -	xtime.tv_nsec = now.tv_nsec; -	raw_time.tv_sec = 0; -	raw_time.tv_nsec = 0; -	if (boot.tv_sec == 0 && boot.tv_nsec == 0) { -		boot.tv_sec = xtime.tv_sec; -		boot.tv_nsec = xtime.tv_nsec; -	} -	set_normalized_timespec(&wall_to_monotonic, -				-boot.tv_sec, -boot.tv_nsec); -	total_sleep_time.tv_sec = 0; -	total_sleep_time.tv_nsec = 0; -	write_sequnlock_irqrestore(&xtime_lock, flags); +	tk_setup_internals(tk, clock); + +	tk_set_xtime(tk, &now); +	tk->raw_time.tv_sec = 0; +	tk->raw_time.tv_nsec = 0; +	if (boot.tv_sec == 0 && boot.tv_nsec == 0) +		boot = tk_xtime(tk); + +	set_normalized_timespec(&tmp, -boot.tv_sec, -boot.tv_nsec); +	tk_set_wall_to_mono(tk, tmp); + +	tmp.tv_sec = 0; +	tmp.tv_nsec = 0; +	tk_set_sleep_time(tk, tmp); + +	memcpy(&shadow_timekeeper, &timekeeper, sizeof(timekeeper)); + +	write_seqcount_end(&timekeeper_seq); +	raw_spin_unlock_irqrestore(&timekeeper_lock, flags);  }  /* time in seconds when suspend began */  static struct timespec timekeeping_suspend_time;  /** + * __timekeeping_inject_sleeptime - Internal function to add sleep interval + * @delta: pointer to a timespec delta value + * + * Takes a timespec offset measuring a suspend interval and properly + * adds the sleep offset to the timekeeping variables. + */ +static void __timekeeping_inject_sleeptime(struct timekeeper *tk, +							struct timespec *delta) +{ +	if (!timespec_valid_strict(delta)) { +		printk_deferred(KERN_WARNING +				"__timekeeping_inject_sleeptime: Invalid " +				"sleep delta value!\n"); +		return; +	} +	tk_xtime_add(tk, delta); +	tk_set_wall_to_mono(tk, timespec_sub(tk->wall_to_monotonic, *delta)); +	tk_set_sleep_time(tk, timespec_add(tk->total_sleep_time, *delta)); +	tk_debug_account_sleep_time(delta); +} + +/** + * timekeeping_inject_sleeptime - Adds suspend interval to timeekeeping values + * @delta: pointer to a timespec delta value + * + * This hook is for architectures that cannot support read_persistent_clock + * because their RTC/persistent clock is only accessible when irqs are enabled. + * + * This function should only be called by rtc_resume(), and allows + * a suspend offset to be injected into the timekeeping values. 
+ */ +void timekeeping_inject_sleeptime(struct timespec *delta) +{ +	struct timekeeper *tk = &timekeeper; +	unsigned long flags; + +	/* +	 * Make sure we don't set the clock twice, as timekeeping_resume() +	 * already did it +	 */ +	if (has_persistent_clock()) +		return; + +	raw_spin_lock_irqsave(&timekeeper_lock, flags); +	write_seqcount_begin(&timekeeper_seq); + +	timekeeping_forward_now(tk); + +	__timekeeping_inject_sleeptime(tk, delta); + +	timekeeping_update(tk, TK_CLEAR_NTP | TK_MIRROR | TK_CLOCK_WAS_SET); + +	write_seqcount_end(&timekeeper_seq); +	raw_spin_unlock_irqrestore(&timekeeper_lock, flags); + +	/* signal hrtimers about time change */ +	clock_was_set(); +} + +/**   * timekeeping_resume - Resumes the generic timekeeping subsystem. - * @dev:	unused   *   * This is for the generic clocksource timekeeping.   * xtime/wall_to_monotonic/jiffies/etc are   * still managed by arch specific suspend/resume code.   */ -static int timekeeping_resume(struct sys_device *dev) +static void timekeeping_resume(void)  { +	struct timekeeper *tk = &timekeeper; +	struct clocksource *clock = tk->clock;  	unsigned long flags; -	struct timespec ts; +	struct timespec ts_new, ts_delta; +	cycle_t cycle_now, cycle_delta; +	bool suspendtime_found = false; -	read_persistent_clock(&ts); +	read_persistent_clock(&ts_new); +	clockevents_resume();  	clocksource_resume(); -	write_seqlock_irqsave(&xtime_lock, flags); +	raw_spin_lock_irqsave(&timekeeper_lock, flags); +	write_seqcount_begin(&timekeeper_seq); -	if (timespec_compare(&ts, &timekeeping_suspend_time) > 0) { -		ts = timespec_sub(ts, timekeeping_suspend_time); -		xtime = timespec_add(xtime, ts); -		wall_to_monotonic = timespec_sub(wall_to_monotonic, ts); -		total_sleep_time = timespec_add(total_sleep_time, ts); +	/* +	 * After system resumes, we need to calculate the suspended time and +	 * compensate it for the OS time. There are 3 sources that could be +	 * used: Nonstop clocksource during suspend, persistent clock and rtc +	 * device. +	 * +	 * One specific platform may have 1 or 2 or all of them, and the +	 * preference will be: +	 *	suspend-nonstop clocksource -> persistent clock -> rtc +	 * The less preferred source will only be tried if there is no better +	 * usable source. The rtc part is handled separately in rtc core code. +	 */ +	cycle_now = clock->read(clock); +	if ((clock->flags & CLOCK_SOURCE_SUSPEND_NONSTOP) && +		cycle_now > clock->cycle_last) { +		u64 num, max = ULLONG_MAX; +		u32 mult = clock->mult; +		u32 shift = clock->shift; +		s64 nsec = 0; + +		cycle_delta = (cycle_now - clock->cycle_last) & clock->mask; + +		/* +		 * "cycle_delta * mutl" may cause 64 bits overflow, if the +		 * suspended time is too long. 
In that case we need do the +		 * 64 bits math carefully +		 */ +		do_div(max, mult); +		if (cycle_delta > max) { +			num = div64_u64(cycle_delta, max); +			nsec = (((u64) max * mult) >> shift) * num; +			cycle_delta -= num * max; +		} +		nsec += ((u64) cycle_delta * mult) >> shift; + +		ts_delta = ns_to_timespec(nsec); +		suspendtime_found = true; +	} else if (timespec_compare(&ts_new, &timekeeping_suspend_time) > 0) { +		ts_delta = timespec_sub(ts_new, timekeeping_suspend_time); +		suspendtime_found = true;  	} -	/* re-base the last cycle value */ -	timekeeper.clock->cycle_last = timekeeper.clock->read(timekeeper.clock); -	timekeeper.ntp_error = 0; + +	if (suspendtime_found) +		__timekeeping_inject_sleeptime(tk, &ts_delta); + +	/* Re-base the last cycle value */ +	tk->cycle_last = clock->cycle_last = cycle_now; +	tk->ntp_error = 0;  	timekeeping_suspended = 0; -	write_sequnlock_irqrestore(&xtime_lock, flags); +	timekeeping_update(tk, TK_MIRROR | TK_CLOCK_WAS_SET); +	write_seqcount_end(&timekeeper_seq); +	raw_spin_unlock_irqrestore(&timekeeper_lock, flags);  	touch_softlockup_watchdog();  	clockevents_notify(CLOCK_EVT_NOTIFY_RESUME, NULL);  	/* Resume hrtimers */ -	hres_timers_resume(); - -	return 0; +	hrtimers_resume();  } -static int timekeeping_suspend(struct sys_device *dev, pm_message_t state) +static int timekeeping_suspend(void)  { +	struct timekeeper *tk = &timekeeper;  	unsigned long flags; +	struct timespec		delta, delta_delta; +	static struct timespec	old_delta;  	read_persistent_clock(&timekeeping_suspend_time); -	write_seqlock_irqsave(&xtime_lock, flags); -	timekeeping_forward_now(); +	/* +	 * On some systems the persistent_clock can not be detected at +	 * timekeeping_init by its return value, so if we see a valid +	 * value returned, update the persistent_clock_exists flag. +	 */ +	if (timekeeping_suspend_time.tv_sec || timekeeping_suspend_time.tv_nsec) +		persistent_clock_exist = true; + +	raw_spin_lock_irqsave(&timekeeper_lock, flags); +	write_seqcount_begin(&timekeeper_seq); +	timekeeping_forward_now(tk);  	timekeeping_suspended = 1; -	write_sequnlock_irqrestore(&xtime_lock, flags); + +	/* +	 * To avoid drift caused by repeated suspend/resumes, +	 * which each can add ~1 second drift error, +	 * try to compensate so the difference in system time +	 * and persistent_clock time stays close to constant. +	 */ +	delta = timespec_sub(tk_xtime(tk), timekeeping_suspend_time); +	delta_delta = timespec_sub(delta, old_delta); +	if (abs(delta_delta.tv_sec)  >= 2) { +		/* +		 * if delta_delta is too large, assume time correction +		 * has occured and set old_delta to the current delta. 
+		 */ +		old_delta = delta; +	} else { +		/* Otherwise try to adjust old_system to compensate */ +		timekeeping_suspend_time = +			timespec_add(timekeeping_suspend_time, delta_delta); +	} + +	timekeeping_update(tk, TK_MIRROR); +	write_seqcount_end(&timekeeper_seq); +	raw_spin_unlock_irqrestore(&timekeeper_lock, flags);  	clockevents_notify(CLOCK_EVT_NOTIFY_SUSPEND, NULL);  	clocksource_suspend(); +	clockevents_suspend();  	return 0;  }  /* sysfs resume/suspend bits for timekeeping */ -static struct sysdev_class timekeeping_sysclass = { -	.name		= "timekeeping", +static struct syscore_ops timekeeping_syscore_ops = {  	.resume		= timekeeping_resume,  	.suspend	= timekeeping_suspend,  }; -static struct sys_device device_timer = { -	.id		= 0, -	.cls		= &timekeeping_sysclass, -}; - -static int __init timekeeping_init_device(void) +static int __init timekeeping_init_ops(void)  { -	int error = sysdev_class_register(&timekeeping_sysclass); -	if (!error) -		error = sysdev_register(&device_timer); -	return error; +	register_syscore_ops(&timekeeping_syscore_ops); +	return 0;  } -device_initcall(timekeeping_init_device); +device_initcall(timekeeping_init_ops);  /*   * If the error is already larger, we look ahead even further   * to compensate for late or lost adjustments.   */ -static __always_inline int timekeeping_bigadjust(s64 error, s64 *interval, +static __always_inline int timekeeping_bigadjust(struct timekeeper *tk, +						 s64 error, s64 *interval,  						 s64 *offset)  {  	s64 tick_error, i; @@ -612,7 +1074,7 @@ static __always_inline int timekeeping_bigadjust(s64 error, s64 *interval,  	 * here.  This is tuned so that an error of about 1 msec is adjusted  	 * within about 1 sec (or 2^20 nsec in 2^SHIFT_HZ ticks).  	 */ -	error2 = timekeeper.ntp_error >> (NTP_SCALE_SHIFT + 22 - 2 * SHIFT_HZ); +	error2 = tk->ntp_error >> (NTP_SCALE_SHIFT + 22 - 2 * SHIFT_HZ);  	error2 = abs(error2);  	for (look_ahead = 0; error2 > 0; look_ahead++)  		error2 >>= 2; @@ -621,8 +1083,8 @@ static __always_inline int timekeeping_bigadjust(s64 error, s64 *interval,  	 * Now calculate the error in (1 << look_ahead) ticks, but first  	 * remove the single look ahead already included in the error.  	 */ -	tick_error = tick_length >> (timekeeper.ntp_error_shift + 1); -	tick_error -= timekeeper.xtime_interval >> 1; +	tick_error = ntp_tick_length() >> (tk->ntp_error_shift + 1); +	tick_error -= tk->xtime_interval >> 1;  	error = ((error - tick_error) >> look_ahead) + tick_error;  	/* Finally calculate the adjustment shift value.  */ @@ -647,36 +1109,175 @@ static __always_inline int timekeeping_bigadjust(s64 error, s64 *interval,   * this is optimized for the most common adjustments of -1,0,1,   * for other values we can do a bit more work.   */ -static void timekeeping_adjust(s64 offset) +static void timekeeping_adjust(struct timekeeper *tk, s64 offset)  { -	s64 error, interval = timekeeper.cycle_interval; +	s64 error, interval = tk->cycle_interval;  	int adj; -	error = timekeeper.ntp_error >> (timekeeper.ntp_error_shift - 1); +	/* +	 * The point of this is to check if the error is greater than half +	 * an interval. +	 * +	 * First we shift it down from NTP_SHIFT to clocksource->shifted nsecs. +	 * +	 * Note we subtract one in the shift, so that error is really error*2. +	 * This "saves" dividing(shifting) interval twice, but keeps the +	 * (error > interval) comparison as still measuring if error is +	 * larger than half an interval. +	 * +	 * Note: It does not "save" on aggravation when reading the code. 
+	 */ +	error = tk->ntp_error >> (tk->ntp_error_shift - 1);  	if (error > interval) { +		/* +		 * We now divide error by 4(via shift), which checks if +		 * the error is greater than twice the interval. +		 * If it is greater, we need a bigadjust, if its smaller, +		 * we can adjust by 1. +		 */  		error >>= 2;  		if (likely(error <= interval))  			adj = 1;  		else -			adj = timekeeping_bigadjust(error, &interval, &offset); -	} else if (error < -interval) { -		error >>= 2; -		if (likely(error >= -interval)) { -			adj = -1; -			interval = -interval; -			offset = -offset; -		} else -			adj = timekeeping_bigadjust(error, &interval, &offset); -	} else -		return; +			adj = timekeeping_bigadjust(tk, error, &interval, &offset); +	} else { +		if (error < -interval) { +			/* See comment above, this is just switched for the negative */ +			error >>= 2; +			if (likely(error >= -interval)) { +				adj = -1; +				interval = -interval; +				offset = -offset; +			} else { +				adj = timekeeping_bigadjust(tk, error, &interval, &offset); +			} +		} else { +			goto out_adjust; +		} +	} + +	if (unlikely(tk->clock->maxadj && +		(tk->mult + adj > tk->clock->mult + tk->clock->maxadj))) { +		printk_deferred_once(KERN_WARNING +			"Adjusting %s more than 11%% (%ld vs %ld)\n", +			tk->clock->name, (long)tk->mult + adj, +			(long)tk->clock->mult + tk->clock->maxadj); +	} +	/* +	 * So the following can be confusing. +	 * +	 * To keep things simple, lets assume adj == 1 for now. +	 * +	 * When adj != 1, remember that the interval and offset values +	 * have been appropriately scaled so the math is the same. +	 * +	 * The basic idea here is that we're increasing the multiplier +	 * by one, this causes the xtime_interval to be incremented by +	 * one cycle_interval. This is because: +	 *	xtime_interval = cycle_interval * mult +	 * So if mult is being incremented by one: +	 *	xtime_interval = cycle_interval * (mult + 1) +	 * Its the same as: +	 *	xtime_interval = (cycle_interval * mult) + cycle_interval +	 * Which can be shortened to: +	 *	xtime_interval += cycle_interval +	 * +	 * So offset stores the non-accumulated cycles. Thus the current +	 * time (in shifted nanoseconds) is: +	 *	now = (offset * adj) + xtime_nsec +	 * Now, even though we're adjusting the clock frequency, we have +	 * to keep time consistent. In other words, we can't jump back +	 * in time, and we also want to avoid jumping forward in time. +	 * +	 * So given the same offset value, we need the time to be the same +	 * both before and after the freq adjustment. +	 *	now = (offset * adj_1) + xtime_nsec_1 +	 *	now = (offset * adj_2) + xtime_nsec_2 +	 * So: +	 *	(offset * adj_1) + xtime_nsec_1 = +	 *		(offset * adj_2) + xtime_nsec_2 +	 * And we know: +	 *	adj_2 = adj_1 + 1 +	 * So: +	 *	(offset * adj_1) + xtime_nsec_1 = +	 *		(offset * (adj_1+1)) + xtime_nsec_2 +	 *	(offset * adj_1) + xtime_nsec_1 = +	 *		(offset * adj_1) + offset + xtime_nsec_2 +	 * Canceling the sides: +	 *	xtime_nsec_1 = offset + xtime_nsec_2 +	 * Which gives us: +	 *	xtime_nsec_2 = xtime_nsec_1 - offset +	 * Which simplfies to: +	 *	xtime_nsec -= offset +	 * +	 * XXX - TODO: Doc ntp_error calculation. +	 */ +	tk->mult += adj; +	tk->xtime_interval += interval; +	tk->xtime_nsec -= offset; +	tk->ntp_error -= (interval - offset) << tk->ntp_error_shift; + +out_adjust: +	/* +	 * It may be possible that when we entered this function, xtime_nsec +	 * was very small.  
Further, if we're slightly speeding the clocksource +	 * in the code above, its possible the required corrective factor to +	 * xtime_nsec could cause it to underflow. +	 * +	 * Now, since we already accumulated the second, cannot simply roll +	 * the accumulated second back, since the NTP subsystem has been +	 * notified via second_overflow. So instead we push xtime_nsec forward +	 * by the amount we underflowed, and add that amount into the error. +	 * +	 * We'll correct this error next time through this function, when +	 * xtime_nsec is not as small. +	 */ +	if (unlikely((s64)tk->xtime_nsec < 0)) { +		s64 neg = -(s64)tk->xtime_nsec; +		tk->xtime_nsec = 0; +		tk->ntp_error += neg << tk->ntp_error_shift; +	} -	timekeeper.mult += adj; -	timekeeper.xtime_interval += interval; -	timekeeper.xtime_nsec -= offset; -	timekeeper.ntp_error -= (interval - offset) << -				timekeeper.ntp_error_shift;  } +/** + * accumulate_nsecs_to_secs - Accumulates nsecs into secs + * + * Helper function that accumulates a the nsecs greater then a second + * from the xtime_nsec field to the xtime_secs field. + * It also calls into the NTP code to handle leapsecond processing. + * + */ +static inline unsigned int accumulate_nsecs_to_secs(struct timekeeper *tk) +{ +	u64 nsecps = (u64)NSEC_PER_SEC << tk->shift; +	unsigned int clock_set = 0; + +	while (tk->xtime_nsec >= nsecps) { +		int leap; + +		tk->xtime_nsec -= nsecps; +		tk->xtime_sec++; + +		/* Figure out if its a leap sec and apply if needed */ +		leap = second_overflow(tk->xtime_sec); +		if (unlikely(leap)) { +			struct timespec ts; + +			tk->xtime_sec += leap; + +			ts.tv_sec = leap; +			ts.tv_nsec = 0; +			tk_set_wall_to_mono(tk, +				timespec_sub(tk->wall_to_monotonic, ts)); + +			__timekeeping_set_tai_offset(tk, tk->tai_offset - leap); + +			clock_set = TK_CLOCK_WAS_SET; +		} +	} +	return clock_set; +}  /**   * logarithmic_accumulation - shifted accumulation of cycles @@ -687,143 +1288,164 @@ static void timekeeping_adjust(s64 offset)   *   * Returns the unconsumed cycles.   
*/ -static cycle_t logarithmic_accumulation(cycle_t offset, int shift) +static cycle_t logarithmic_accumulation(struct timekeeper *tk, cycle_t offset, +						u32 shift, +						unsigned int *clock_set)  { -	u64 nsecps = (u64)NSEC_PER_SEC << timekeeper.shift; +	cycle_t interval = tk->cycle_interval << shift;  	u64 raw_nsecs;  	/* If the offset is smaller then a shifted interval, do nothing */ -	if (offset < timekeeper.cycle_interval<<shift) +	if (offset < interval)  		return offset;  	/* Accumulate one shifted interval */ -	offset -= timekeeper.cycle_interval << shift; -	timekeeper.clock->cycle_last += timekeeper.cycle_interval << shift; - -	timekeeper.xtime_nsec += timekeeper.xtime_interval << shift; -	while (timekeeper.xtime_nsec >= nsecps) { -		timekeeper.xtime_nsec -= nsecps; -		xtime.tv_sec++; -		second_overflow(); -	} +	offset -= interval; +	tk->cycle_last += interval; + +	tk->xtime_nsec += tk->xtime_interval << shift; +	*clock_set |= accumulate_nsecs_to_secs(tk);  	/* Accumulate raw time */ -	raw_nsecs = timekeeper.raw_interval << shift; -	raw_nsecs += raw_time.tv_nsec; +	raw_nsecs = (u64)tk->raw_interval << shift; +	raw_nsecs += tk->raw_time.tv_nsec;  	if (raw_nsecs >= NSEC_PER_SEC) {  		u64 raw_secs = raw_nsecs;  		raw_nsecs = do_div(raw_secs, NSEC_PER_SEC); -		raw_time.tv_sec += raw_secs; +		tk->raw_time.tv_sec += raw_secs;  	} -	raw_time.tv_nsec = raw_nsecs; +	tk->raw_time.tv_nsec = raw_nsecs;  	/* Accumulate error between NTP and clock interval */ -	timekeeper.ntp_error += tick_length << shift; -	timekeeper.ntp_error -= timekeeper.xtime_interval << -				(timekeeper.ntp_error_shift + shift); +	tk->ntp_error += ntp_tick_length() << shift; +	tk->ntp_error -= (tk->xtime_interval + tk->xtime_remainder) << +						(tk->ntp_error_shift + shift);  	return offset;  } +#ifdef CONFIG_GENERIC_TIME_VSYSCALL_OLD +static inline void old_vsyscall_fixup(struct timekeeper *tk) +{ +	s64 remainder; + +	/* +	* Store only full nanoseconds into xtime_nsec after rounding +	* it up and add the remainder to the error difference. +	* XXX - This is necessary to avoid small 1ns inconsistnecies caused +	* by truncating the remainder in vsyscalls. However, it causes +	* additional work to be done in timekeeping_adjust(). Once +	* the vsyscall implementations are converted to use xtime_nsec +	* (shifted nanoseconds), and CONFIG_GENERIC_TIME_VSYSCALL_OLD +	* users are removed, this can be killed. +	*/ +	remainder = tk->xtime_nsec & ((1ULL << tk->shift) - 1); +	tk->xtime_nsec -= remainder; +	tk->xtime_nsec += 1ULL << tk->shift; +	tk->ntp_error += remainder << tk->ntp_error_shift; +	tk->ntp_error -= (1ULL << tk->shift) << tk->ntp_error_shift; +} +#else +#define old_vsyscall_fixup(tk) +#endif + +  /**   * update_wall_time - Uses the current clocksource to increment the wall time   * - * Called from the timer interrupt, must hold a write on xtime_lock.   
*/  void update_wall_time(void)  {  	struct clocksource *clock; +	struct timekeeper *real_tk = &timekeeper; +	struct timekeeper *tk = &shadow_timekeeper;  	cycle_t offset;  	int shift = 0, maxshift; +	unsigned int clock_set = 0; +	unsigned long flags; + +	raw_spin_lock_irqsave(&timekeeper_lock, flags);  	/* Make sure we're fully resumed: */  	if (unlikely(timekeeping_suspended)) -		return; +		goto out; -	clock = timekeeper.clock; +	clock = real_tk->clock;  #ifdef CONFIG_ARCH_USES_GETTIMEOFFSET -	offset = timekeeper.cycle_interval; +	offset = real_tk->cycle_interval;  #else  	offset = (clock->read(clock) - clock->cycle_last) & clock->mask;  #endif -	timekeeper.xtime_nsec = (s64)xtime.tv_nsec << timekeeper.shift; + +	/* Check if there's really nothing to do */ +	if (offset < real_tk->cycle_interval) +		goto out;  	/*  	 * With NO_HZ we may have to accumulate many cycle_intervals  	 * (think "ticks") worth of time at once. To do this efficiently,  	 * we calculate the largest doubling multiple of cycle_intervals -	 * that is smaller then the offset. We then accumulate that +	 * that is smaller than the offset.  We then accumulate that  	 * chunk in one go, and then try to consume the next smaller  	 * doubled multiple.  	 */ -	shift = ilog2(offset) - ilog2(timekeeper.cycle_interval); +	shift = ilog2(offset) - ilog2(tk->cycle_interval);  	shift = max(0, shift); -	/* Bound shift to one less then what overflows tick_length */ -	maxshift = (8*sizeof(tick_length) - (ilog2(tick_length)+1)) - 1; +	/* Bound shift to one less than what overflows tick_length */ +	maxshift = (64 - (ilog2(ntp_tick_length())+1)) - 1;  	shift = min(shift, maxshift); -	while (offset >= timekeeper.cycle_interval) { -		offset = logarithmic_accumulation(offset, shift); -		if(offset < timekeeper.cycle_interval<<shift) +	while (offset >= tk->cycle_interval) { +		offset = logarithmic_accumulation(tk, offset, shift, +							&clock_set); +		if (offset < tk->cycle_interval<<shift)  			shift--;  	}  	/* correct the clock when NTP error is too big */ -	timekeeping_adjust(offset); +	timekeeping_adjust(tk, offset);  	/* -	 * Since in the loop above, we accumulate any amount of time -	 * in xtime_nsec over a second into xtime.tv_sec, its possible for -	 * xtime_nsec to be fairly small after the loop. Further, if we're -	 * slightly speeding the clocksource up in timekeeping_adjust(), -	 * its possible the required corrective factor to xtime_nsec could -	 * cause it to underflow. -	 * -	 * Now, we cannot simply roll the accumulated second back, since -	 * the NTP subsystem has been notified via second_overflow. So -	 * instead we push xtime_nsec forward by the amount we underflowed, -	 * and add that amount into the error. -	 * -	 * We'll correct this error next time through this function, when -	 * xtime_nsec is not as small. +	 * XXX This can be killed once everyone converts +	 * to the new update_vsyscall.  	 */ -	if (unlikely((s64)timekeeper.xtime_nsec < 0)) { -		s64 neg = -(s64)timekeeper.xtime_nsec; -		timekeeper.xtime_nsec = 0; -		timekeeper.ntp_error += neg << timekeeper.ntp_error_shift; -	} - +	old_vsyscall_fixup(tk);  	/* -	 * Store full nanoseconds into xtime after rounding it up and -	 * add the remainder to the error difference. 
+	 * Finally, make sure that after the rounding +	 * xtime_nsec isn't larger than NSEC_PER_SEC  	 */ -	xtime.tv_nsec =	((s64) timekeeper.xtime_nsec >> timekeeper.shift) + 1; -	timekeeper.xtime_nsec -= (s64) xtime.tv_nsec << timekeeper.shift; -	timekeeper.ntp_error +=	timekeeper.xtime_nsec << -				timekeeper.ntp_error_shift; +	clock_set |= accumulate_nsecs_to_secs(tk); +	write_seqcount_begin(&timekeeper_seq); +	/* Update clock->cycle_last with the new value */ +	clock->cycle_last = tk->cycle_last;  	/* -	 * Finally, make sure that after the rounding -	 * xtime.tv_nsec isn't larger then NSEC_PER_SEC +	 * Update the real timekeeper. +	 * +	 * We could avoid this memcpy by switching pointers, but that +	 * requires changes to all other timekeeper usage sites as +	 * well, i.e. move the timekeeper pointer getter into the +	 * spinlocked/seqcount protected sections. And we trade this +	 * memcpy under the timekeeper_seq against one before we start +	 * updating.  	 */ -	if (unlikely(xtime.tv_nsec >= NSEC_PER_SEC)) { -		xtime.tv_nsec -= NSEC_PER_SEC; -		xtime.tv_sec++; -		second_overflow(); -	} - -	/* check to see if there is a new clocksource to use */ -	update_vsyscall(&xtime, &wall_to_monotonic, timekeeper.clock, -				timekeeper.mult); +	memcpy(real_tk, tk, sizeof(*tk)); +	timekeeping_update(real_tk, clock_set); +	write_seqcount_end(&timekeeper_seq); +out: +	raw_spin_unlock_irqrestore(&timekeeper_lock, flags); +	if (clock_set) +		/* Have to call _delayed version, since in irq context*/ +		clock_was_set_delayed();  }  /**   * getboottime - Return the real time of system boot.   * @ts:		pointer to the timespec to be set   * - * Returns the time of day in a timespec. + * Returns the wall-time of boot in a timespec.   *   * This is based on the wall_to_monotonic offset and the total suspend   * time. Calls to settimeofday will affect the value returned (which @@ -832,9 +1454,12 @@ void update_wall_time(void)   */  void getboottime(struct timespec *ts)  { +	struct timekeeper *tk = &timekeeper;  	struct timespec boottime = { -		.tv_sec = wall_to_monotonic.tv_sec + total_sleep_time.tv_sec, -		.tv_nsec = wall_to_monotonic.tv_nsec + total_sleep_time.tv_nsec +		.tv_sec = tk->wall_to_monotonic.tv_sec + +				tk->total_sleep_time.tv_sec, +		.tv_nsec = tk->wall_to_monotonic.tv_nsec + +				tk->total_sleep_time.tv_nsec  	};  	set_normalized_timespec(ts, -boottime.tv_sec, -boottime.tv_nsec); @@ -842,41 +1467,93 @@ void getboottime(struct timespec *ts)  EXPORT_SYMBOL_GPL(getboottime);  /** + * get_monotonic_boottime - Returns monotonic time since boot + * @ts:		pointer to the timespec to be set + * + * Returns the monotonic time since boot in a timespec. + * + * This is similar to CLOCK_MONTONIC/ktime_get_ts, but also + * includes the time spent in suspend. 
+ */ +void get_monotonic_boottime(struct timespec *ts) +{ +	struct timekeeper *tk = &timekeeper; +	struct timespec tomono, sleep; +	s64 nsec; +	unsigned int seq; + +	WARN_ON(timekeeping_suspended); + +	do { +		seq = read_seqcount_begin(&timekeeper_seq); +		ts->tv_sec = tk->xtime_sec; +		nsec = timekeeping_get_ns(tk); +		tomono = tk->wall_to_monotonic; +		sleep = tk->total_sleep_time; + +	} while (read_seqcount_retry(&timekeeper_seq, seq)); + +	ts->tv_sec += tomono.tv_sec + sleep.tv_sec; +	ts->tv_nsec = 0; +	timespec_add_ns(ts, nsec + tomono.tv_nsec + sleep.tv_nsec); +} +EXPORT_SYMBOL_GPL(get_monotonic_boottime); + +/** + * ktime_get_boottime - Returns monotonic time since boot in a ktime + * + * Returns the monotonic time since boot in a ktime + * + * This is similar to CLOCK_MONTONIC/ktime_get, but also + * includes the time spent in suspend. + */ +ktime_t ktime_get_boottime(void) +{ +	struct timespec ts; + +	get_monotonic_boottime(&ts); +	return timespec_to_ktime(ts); +} +EXPORT_SYMBOL_GPL(ktime_get_boottime); + +/**   * monotonic_to_bootbased - Convert the monotonic time to boot based.   * @ts:		pointer to the timespec to be converted   */  void monotonic_to_bootbased(struct timespec *ts)  { -	*ts = timespec_add(*ts, total_sleep_time); +	struct timekeeper *tk = &timekeeper; + +	*ts = timespec_add(*ts, tk->total_sleep_time);  }  EXPORT_SYMBOL_GPL(monotonic_to_bootbased);  unsigned long get_seconds(void)  { -	return xtime.tv_sec; +	struct timekeeper *tk = &timekeeper; + +	return tk->xtime_sec;  }  EXPORT_SYMBOL(get_seconds);  struct timespec __current_kernel_time(void)  { -	return xtime; -} +	struct timekeeper *tk = &timekeeper; -struct timespec __get_wall_to_monotonic(void) -{ -	return wall_to_monotonic; +	return tk_xtime(tk);  }  struct timespec current_kernel_time(void)  { +	struct timekeeper *tk = &timekeeper;  	struct timespec now;  	unsigned long seq;  	do { -		seq = read_seqbegin(&xtime_lock); +		seq = read_seqcount_begin(&timekeeper_seq); -		now = xtime; -	} while (read_seqretry(&xtime_lock, seq)); +		now = tk_xtime(tk); +	} while (read_seqcount_retry(&timekeeper_seq, seq));  	return now;  } @@ -884,17 +1561,184 @@ EXPORT_SYMBOL(current_kernel_time);  struct timespec get_monotonic_coarse(void)  { +	struct timekeeper *tk = &timekeeper;  	struct timespec now, mono;  	unsigned long seq;  	do { -		seq = read_seqbegin(&xtime_lock); +		seq = read_seqcount_begin(&timekeeper_seq); -		now = xtime; -		mono = wall_to_monotonic; -	} while (read_seqretry(&xtime_lock, seq)); +		now = tk_xtime(tk); +		mono = tk->wall_to_monotonic; +	} while (read_seqcount_retry(&timekeeper_seq, seq));  	set_normalized_timespec(&now, now.tv_sec + mono.tv_sec,  				now.tv_nsec + mono.tv_nsec);  	return now;  } + +/* + * Must hold jiffies_lock + */ +void do_timer(unsigned long ticks) +{ +	jiffies_64 += ticks; +	calc_global_load(ticks); +} + +/** + * get_xtime_and_monotonic_and_sleep_offset() - get xtime, wall_to_monotonic, + *    and sleep offsets. 
+ * @xtim:	pointer to timespec to be set with xtime + * @wtom:	pointer to timespec to be set with wall_to_monotonic + * @sleep:	pointer to timespec to be set with time in suspend + */ +void get_xtime_and_monotonic_and_sleep_offset(struct timespec *xtim, +				struct timespec *wtom, struct timespec *sleep) +{ +	struct timekeeper *tk = &timekeeper; +	unsigned long seq; + +	do { +		seq = read_seqcount_begin(&timekeeper_seq); +		*xtim = tk_xtime(tk); +		*wtom = tk->wall_to_monotonic; +		*sleep = tk->total_sleep_time; +	} while (read_seqcount_retry(&timekeeper_seq, seq)); +} + +#ifdef CONFIG_HIGH_RES_TIMERS +/** + * ktime_get_update_offsets - hrtimer helper + * @offs_real:	pointer to storage for monotonic -> realtime offset + * @offs_boot:	pointer to storage for monotonic -> boottime offset + * @offs_tai:	pointer to storage for monotonic -> clock tai offset + * + * Returns current monotonic time and updates the offsets + * Called from hrtimer_interrupt() or retrigger_next_event() + */ +ktime_t ktime_get_update_offsets(ktime_t *offs_real, ktime_t *offs_boot, +							ktime_t *offs_tai) +{ +	struct timekeeper *tk = &timekeeper; +	ktime_t now; +	unsigned int seq; +	u64 secs, nsecs; + +	do { +		seq = read_seqcount_begin(&timekeeper_seq); + +		secs = tk->xtime_sec; +		nsecs = timekeeping_get_ns(tk); + +		*offs_real = tk->offs_real; +		*offs_boot = tk->offs_boot; +		*offs_tai = tk->offs_tai; +	} while (read_seqcount_retry(&timekeeper_seq, seq)); + +	now = ktime_add_ns(ktime_set(secs, 0), nsecs); +	now = ktime_sub(now, *offs_real); +	return now; +} +#endif + +/** + * ktime_get_monotonic_offset() - get wall_to_monotonic in ktime_t format + */ +ktime_t ktime_get_monotonic_offset(void) +{ +	struct timekeeper *tk = &timekeeper; +	unsigned long seq; +	struct timespec wtom; + +	do { +		seq = read_seqcount_begin(&timekeeper_seq); +		wtom = tk->wall_to_monotonic; +	} while (read_seqcount_retry(&timekeeper_seq, seq)); + +	return timespec_to_ktime(wtom); +} +EXPORT_SYMBOL_GPL(ktime_get_monotonic_offset); + +/** + * do_adjtimex() - Accessor function to NTP __do_adjtimex function + */ +int do_adjtimex(struct timex *txc) +{ +	struct timekeeper *tk = &timekeeper; +	unsigned long flags; +	struct timespec ts; +	s32 orig_tai, tai; +	int ret; + +	/* Validate the data before disabling interrupts */ +	ret = ntp_validate_timex(txc); +	if (ret) +		return ret; + +	if (txc->modes & ADJ_SETOFFSET) { +		struct timespec delta; +		delta.tv_sec  = txc->time.tv_sec; +		delta.tv_nsec = txc->time.tv_usec; +		if (!(txc->modes & ADJ_NANO)) +			delta.tv_nsec *= 1000; +		ret = timekeeping_inject_offset(&delta); +		if (ret) +			return ret; +	} + +	getnstimeofday(&ts); + +	raw_spin_lock_irqsave(&timekeeper_lock, flags); +	write_seqcount_begin(&timekeeper_seq); + +	orig_tai = tai = tk->tai_offset; +	ret = __do_adjtimex(txc, &ts, &tai); + +	if (tai != orig_tai) { +		__timekeeping_set_tai_offset(tk, tai); +		timekeeping_update(tk, TK_MIRROR | TK_CLOCK_WAS_SET); +	} +	write_seqcount_end(&timekeeper_seq); +	raw_spin_unlock_irqrestore(&timekeeper_lock, flags); + +	if (tai != orig_tai) +		clock_was_set(); + +	ntp_notify_cmos_timer(); + +	return ret; +} + +#ifdef CONFIG_NTP_PPS +/** + * hardpps() - Accessor function to NTP __hardpps function + */ +void hardpps(const struct timespec *phase_ts, const struct timespec *raw_ts) +{ +	unsigned long flags; + +	raw_spin_lock_irqsave(&timekeeper_lock, flags); +	write_seqcount_begin(&timekeeper_seq); + +	__hardpps(phase_ts, raw_ts); + +	write_seqcount_end(&timekeeper_seq); +	
raw_spin_unlock_irqrestore(&timekeeper_lock, flags); +} +EXPORT_SYMBOL(hardpps); +#endif + +/** + * xtime_update() - advances the timekeeping infrastructure + * @ticks:	number of ticks, that have elapsed since the last call. + * + * Must be called with interrupts disabled. + */ +void xtime_update(unsigned long ticks) +{ +	write_seqlock(&jiffies_lock); +	do_timer(ticks); +	write_sequnlock(&jiffies_lock); +	update_wall_time(); +} diff --git a/kernel/time/timekeeping_debug.c b/kernel/time/timekeeping_debug.c new file mode 100644 index 00000000000..4d54f97558d --- /dev/null +++ b/kernel/time/timekeeping_debug.c @@ -0,0 +1,74 @@ +/* + * debugfs file to track time spent in suspend + * + * Copyright (c) 2011, Google, Inc. + * + * This program is free software; you can redistribute it and/or modify + * it under the terms of the GNU General Public License as published by + * the Free Software Foundation; either version 2 of the License, or + * (at your option) any later version. + * + * This program is distributed in the hope that it will be useful, but WITHOUT + * ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or + * FITNESS FOR A PARTICULAR PURPOSE.  See the GNU General Public License for + * more details. + */ + +#include <linux/debugfs.h> +#include <linux/err.h> +#include <linux/init.h> +#include <linux/kernel.h> +#include <linux/seq_file.h> +#include <linux/time.h> + +#include "timekeeping_internal.h" + +static unsigned int sleep_time_bin[32] = {0}; + +static int tk_debug_show_sleep_time(struct seq_file *s, void *data) +{ +	unsigned int bin; +	seq_puts(s, "      time (secs)        count\n"); +	seq_puts(s, "------------------------------\n"); +	for (bin = 0; bin < 32; bin++) { +		if (sleep_time_bin[bin] == 0) +			continue; +		seq_printf(s, "%10u - %-10u %4u\n", +			bin ? 
1 << (bin - 1) : 0, 1 << bin, +				sleep_time_bin[bin]); +	} +	return 0; +} + +static int tk_debug_sleep_time_open(struct inode *inode, struct file *file) +{ +	return single_open(file, tk_debug_show_sleep_time, NULL); +} + +static const struct file_operations tk_debug_sleep_time_fops = { +	.open		= tk_debug_sleep_time_open, +	.read		= seq_read, +	.llseek		= seq_lseek, +	.release	= single_release, +}; + +static int __init tk_debug_sleep_time_init(void) +{ +	struct dentry *d; + +	d = debugfs_create_file("sleep_time", 0444, NULL, NULL, +		&tk_debug_sleep_time_fops); +	if (!d) { +		pr_err("Failed to create sleep_time debug file\n"); +		return -ENOMEM; +	} + +	return 0; +} +late_initcall(tk_debug_sleep_time_init); + +void tk_debug_account_sleep_time(struct timespec *t) +{ +	sleep_time_bin[fls(t->tv_sec)]++; +} + diff --git a/kernel/time/timekeeping_internal.h b/kernel/time/timekeeping_internal.h new file mode 100644 index 00000000000..13323ea08ff --- /dev/null +++ b/kernel/time/timekeeping_internal.h @@ -0,0 +1,14 @@ +#ifndef _TIMEKEEPING_INTERNAL_H +#define _TIMEKEEPING_INTERNAL_H +/* + * timekeeping debug functions + */ +#include <linux/time.h> + +#ifdef CONFIG_DEBUG_FS +extern void tk_debug_account_sleep_time(struct timespec *t); +#else +#define tk_debug_account_sleep_time(x) +#endif + +#endif /* _TIMEKEEPING_INTERNAL_H */ diff --git a/kernel/time/timer_list.c b/kernel/time/timer_list.c index ab8f5e33fa9..61ed862cdd3 100644 --- a/kernel/time/timer_list.c +++ b/kernel/time/timer_list.c @@ -20,6 +20,13 @@  #include <asm/uaccess.h> + +struct timer_list_iter { +	int cpu; +	bool second_pass; +	u64 now; +}; +  typedef void (*print_fn_t)(struct seq_file *m, unsigned int *classes);  DECLARE_PER_CPU(struct hrtimer_cpu_base, hrtimer_bases); @@ -41,7 +48,7 @@ static void print_name_offset(struct seq_file *m, void *sym)  	char symname[KSYM_NAME_LEN];  	if (lookup_symbol_name((unsigned long)sym, symname) < 0) -		SEQ_printf(m, "<%p>", sym); +		SEQ_printf(m, "<%pK>", sym);  	else  		SEQ_printf(m, "%s", symname);  } @@ -79,26 +86,26 @@ print_active_timers(struct seq_file *m, struct hrtimer_clock_base *base,  {  	struct hrtimer *timer, tmp;  	unsigned long next = 0, i; -	struct rb_node *curr; +	struct timerqueue_node *curr;  	unsigned long flags;  next_one:  	i = 0;  	raw_spin_lock_irqsave(&base->cpu_base->lock, flags); -	curr = base->first; +	curr = timerqueue_getnext(&base->active);  	/*  	 * Crude but we have to do this O(N*N) thing, because  	 * we have to unlock the base when printing:  	 */  	while (curr && i < next) { -		curr = rb_next(curr); +		curr = timerqueue_iterate_next(curr);  		i++;  	}  	if (curr) { -		timer = rb_entry(curr, struct hrtimer, node); +		timer = container_of(curr, struct hrtimer, node);  		tmp = *timer;  		raw_spin_unlock_irqrestore(&base->cpu_base->lock, flags); @@ -112,7 +119,7 @@ next_one:  static void  print_base(struct seq_file *m, struct hrtimer_clock_base *base, u64 now)  { -	SEQ_printf(m, "  .base:       %p\n", base); +	SEQ_printf(m, "  .base:       %pK\n", base);  	SEQ_printf(m, "  .index:      %d\n",  			base->index);  	SEQ_printf(m, "  .resolution: %Lu nsecs\n", @@ -133,7 +140,6 @@ static void print_cpu(struct seq_file *m, int cpu, u64 now)  	struct hrtimer_cpu_base *cpu_base = &per_cpu(hrtimer_bases, cpu);  	int i; -	SEQ_printf(m, "\n");  	SEQ_printf(m, "cpu: %d\n", cpu);  	for (i = 0; i < HRTIMER_MAX_CLOCK_BASES; i++) {  		SEQ_printf(m, " clock %d:\n", i); @@ -167,7 +173,7 @@ static void print_cpu(struct seq_file *m, int cpu, u64 now)  	{  		struct tick_sched *ts = 
tick_get_tick_sched(cpu);  		P(nohz_mode); -		P_ns(idle_tick); +		P_ns(last_tick);  		P(tick_stopped);  		P(idle_jiffies);  		P(idle_calls); @@ -187,6 +193,7 @@ static void print_cpu(struct seq_file *m, int cpu, u64 now)  #undef P  #undef P_ns +	SEQ_printf(m, "\n");  }  #ifdef CONFIG_GENERIC_CLOCKEVENTS @@ -195,7 +202,6 @@ print_tickdevice(struct seq_file *m, struct tick_device *td, int cpu)  {  	struct clock_event_device *dev = td->evtdev; -	SEQ_printf(m, "\n");  	SEQ_printf(m, "Tick Device: mode:     %d\n", td->mode);  	if (cpu < 0)  		SEQ_printf(m, "Broadcast device\n"); @@ -230,12 +236,11 @@ print_tickdevice(struct seq_file *m, struct tick_device *td, int cpu)  	print_name_offset(m, dev->event_handler);  	SEQ_printf(m, "\n");  	SEQ_printf(m, " retries:        %lu\n", dev->retries); +	SEQ_printf(m, "\n");  } -static void timer_list_show_tickdevices(struct seq_file *m) +static void timer_list_show_tickdevices_header(struct seq_file *m)  { -	int cpu; -  #ifdef CONFIG_GENERIC_CLOCKEVENTS_BROADCAST  	print_tickdevice(m, tick_get_broadcast_device(), -1);  	SEQ_printf(m, "tick_broadcast_mask: %08lx\n", @@ -246,47 +251,111 @@ static void timer_list_show_tickdevices(struct seq_file *m)  #endif  	SEQ_printf(m, "\n");  #endif -	for_each_online_cpu(cpu) -		print_tickdevice(m, tick_get_device(cpu), cpu); -	SEQ_printf(m, "\n");  } -#else -static void timer_list_show_tickdevices(struct seq_file *m) { }  #endif +static inline void timer_list_header(struct seq_file *m, u64 now) +{ +	SEQ_printf(m, "Timer List Version: v0.7\n"); +	SEQ_printf(m, "HRTIMER_MAX_CLOCK_BASES: %d\n", HRTIMER_MAX_CLOCK_BASES); +	SEQ_printf(m, "now at %Ld nsecs\n", (unsigned long long)now); +	SEQ_printf(m, "\n"); +} +  static int timer_list_show(struct seq_file *m, void *v)  { +	struct timer_list_iter *iter = v; + +	if (iter->cpu == -1 && !iter->second_pass) +		timer_list_header(m, iter->now); +	else if (!iter->second_pass) +		print_cpu(m, iter->cpu, iter->now); +#ifdef CONFIG_GENERIC_CLOCKEVENTS +	else if (iter->cpu == -1 && iter->second_pass) +		timer_list_show_tickdevices_header(m); +	else +		print_tickdevice(m, tick_get_device(iter->cpu), iter->cpu); +#endif +	return 0; +} + +void sysrq_timer_list_show(void) +{  	u64 now = ktime_to_ns(ktime_get());  	int cpu; -	SEQ_printf(m, "Timer List Version: v0.6\n"); -	SEQ_printf(m, "HRTIMER_MAX_CLOCK_BASES: %d\n", HRTIMER_MAX_CLOCK_BASES); -	SEQ_printf(m, "now at %Ld nsecs\n", (unsigned long long)now); +	timer_list_header(NULL, now);  	for_each_online_cpu(cpu) -		print_cpu(m, cpu, now); +		print_cpu(NULL, cpu, now); -	SEQ_printf(m, "\n"); -	timer_list_show_tickdevices(m); +#ifdef CONFIG_GENERIC_CLOCKEVENTS +	timer_list_show_tickdevices_header(NULL); +	for_each_online_cpu(cpu) +		print_tickdevice(NULL, tick_get_device(cpu), cpu); +#endif +	return; +} -	return 0; +static void *move_iter(struct timer_list_iter *iter, loff_t offset) +{ +	for (; offset; offset--) { +		iter->cpu = cpumask_next(iter->cpu, cpu_online_mask); +		if (iter->cpu >= nr_cpu_ids) { +#ifdef CONFIG_GENERIC_CLOCKEVENTS +			if (!iter->second_pass) { +				iter->cpu = -1; +				iter->second_pass = true; +			} else +				return NULL; +#else +			return NULL; +#endif +		} +	} +	return iter;  } -void sysrq_timer_list_show(void) +static void *timer_list_start(struct seq_file *file, loff_t *offset) +{ +	struct timer_list_iter *iter = file->private; + +	if (!*offset) +		iter->now = ktime_to_ns(ktime_get()); +	iter->cpu = -1; +	iter->second_pass = false; +	return move_iter(iter, *offset); +} + +static void *timer_list_next(struct 
seq_file *file, void *v, loff_t *offset) +{ +	struct timer_list_iter *iter = file->private; +	++*offset; +	return move_iter(iter, 1); +} + +static void timer_list_stop(struct seq_file *seq, void *v)  { -	timer_list_show(NULL, NULL);  } +static const struct seq_operations timer_list_sops = { +	.start = timer_list_start, +	.next = timer_list_next, +	.stop = timer_list_stop, +	.show = timer_list_show, +}; +  static int timer_list_open(struct inode *inode, struct file *filp)  { -	return single_open(filp, timer_list_show, NULL); +	return seq_open_private(filp, &timer_list_sops, +			sizeof(struct timer_list_iter));  }  static const struct file_operations timer_list_fops = {  	.open		= timer_list_open,  	.read		= seq_read,  	.llseek		= seq_lseek, -	.release	= single_release, +	.release	= seq_release_private,  };  static int __init init_timer_list_procfs(void) diff --git a/kernel/time/timer_stats.c b/kernel/time/timer_stats.c index 2f3b585b8d7..1fb08f21302 100644 --- a/kernel/time/timer_stats.c +++ b/kernel/time/timer_stats.c @@ -81,7 +81,7 @@ struct entry {  /*   * Spinlock protecting the tables - not taken during lookup:   */ -static DEFINE_SPINLOCK(table_lock); +static DEFINE_RAW_SPINLOCK(table_lock);  /*   * Per-CPU lookup locks for fast hash lookup: @@ -188,7 +188,7 @@ static struct entry *tstat_lookup(struct entry *entry, char *comm)  	prev = NULL;  	curr = *head; -	spin_lock(&table_lock); +	raw_spin_lock(&table_lock);  	/*  	 * Make sure we have not raced with another CPU:  	 */ @@ -215,7 +215,7 @@ static struct entry *tstat_lookup(struct entry *entry, char *comm)  			*head = curr;  	}   out_unlock: -	spin_unlock(&table_lock); +	raw_spin_unlock(&table_lock);  	return curr;  } @@ -236,7 +236,7 @@ void timer_stats_update_stats(void *timer, pid_t pid, void *startf,  			      unsigned int timer_flag)  {  	/* -	 * It doesnt matter which lock we take: +	 * It doesn't matter which lock we take:  	 */  	raw_spinlock_t *lock;  	struct entry *entry, input; @@ -298,15 +298,15 @@ static int tstats_show(struct seq_file *m, void *v)  	period = ktime_to_timespec(time);  	ms = period.tv_nsec / 1000000; -	seq_puts(m, "Timer Stats Version: v0.2\n"); +	seq_puts(m, "Timer Stats Version: v0.3\n");  	seq_printf(m, "Sample period: %ld.%03ld s\n", period.tv_sec, ms);  	if (atomic_read(&overflow_count)) -		seq_printf(m, "Overflow: %d entries\n", -			atomic_read(&overflow_count)); +		seq_printf(m, "Overflow: %d entries\n", atomic_read(&overflow_count)); +	seq_printf(m, "Collection: %s\n", timer_stats_active ? "active" : "inactive");  	for (i = 0; i < nr_entries; i++) {  		entry = entries + i; - 		if (entry->timer_flag & TIMER_STATS_FLAG_DEFERRABLE) { +		if (entry->timer_flag & TIMER_STATS_FLAG_DEFERRABLE) {  			seq_printf(m, "%4luD, %5d %-16s ",  				entry->count, entry->pid, entry->comm);  		} else {  | 

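A note on the suspend-time math in timekeeping_resume() above: the nonstop-clocksource path splits the cycle delta into chunks so that cycle_delta * mult can never overflow 64 bits. Below is a small standalone sketch of the same arithmetic (not part of the patch); the 24 MHz mult/shift pair is invented for illustration and does not come from any real clocksource.

/*
 * Sketch of the overflow-safe cycles -> nanoseconds conversion used in
 * the resume path above. max = UINT64_MAX / mult is the largest chunk
 * whose product with mult still fits in 64 bits; larger deltas are
 * converted chunk by chunk, as in the kernel code.
 */
#include <inttypes.h>
#include <stdint.h>
#include <stdio.h>

/* invented clocksource parameters: roughly a 24 MHz counter */
#define CS_MULT  699050667u   /* (cycles * CS_MULT) >> CS_SHIFT ~= ns */
#define CS_SHIFT 24u

static uint64_t cycles_to_ns_safe(uint64_t cycle_delta)
{
	uint64_t max = UINT64_MAX / CS_MULT;   /* biggest non-overflowing chunk */
	uint64_t nsec = 0;

	if (cycle_delta > max) {
		uint64_t num = cycle_delta / max;

		nsec += ((max * CS_MULT) >> CS_SHIFT) * num;
		cycle_delta -= num * max;
	}
	/* remaining delta is < max, so this product cannot overflow */
	nsec += (cycle_delta * CS_MULT) >> CS_SHIFT;
	return nsec;
}

int main(void)
{
	/* ~1 second of cycles: small enough for the single-product path */
	printf("%" PRIu64 " ns\n", cycles_to_ns_safe(24000000ull));
	/* ~2 hours of cycles: forces the chunked path */
	printf("%" PRIu64 " ns\n", cycles_to_ns_safe(24000000ull * 7200));
	return 0;
}

As in the kernel version, each chunk loses only the bits discarded by the final shift, so the error stays bounded even for very long suspend intervals.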
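The long comment in timekeeping_adjust() above derives xtime_nsec_2 = xtime_nsec_1 - offset so that the shifted clock reading offset * mult + xtime_nsec does not jump when mult is bumped by one. The identity can be checked numerically; the sketch below is an illustration only, and the numbers in it are arbitrary rather than taken from any clocksource.

/*
 * Checks the invariant behind "tk->xtime_nsec -= offset" in
 * timekeeping_adjust(): for the same unaccumulated offset, the value
 * offset * mult + xtime_nsec is unchanged when mult is incremented
 * and xtime_nsec is reduced by offset.
 */
#include <assert.h>
#include <stdint.h>
#include <stdio.h>

int main(void)
{
	uint64_t offset = 123456;          /* unaccumulated cycles (arbitrary) */
	uint64_t mult = 1000;              /* current multiplier   (arbitrary) */
	uint64_t xtime_nsec = 987654321;   /* shifted nanoseconds  (arbitrary) */

	uint64_t before = offset * mult + xtime_nsec;

	/* frequency adjustment by adj = +1, with the compensating subtraction */
	mult += 1;
	xtime_nsec -= offset;

	uint64_t after = offset * mult + xtime_nsec;

	assert(before == after);
	printf("clock reading unchanged: %llu == %llu\n",
	       (unsigned long long)before, (unsigned long long)after);
	return 0;
}

The same algebra holds for adj = -1 or larger adjustments once interval and offset have been scaled, which is why the patch applies it uniformly after timekeeping_bigadjust().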
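Every reader converted by the patch follows the read_seqcount_begin()/read_seqcount_retry() loop against timekeeper_seq, while writers serialize on timekeeper_lock and publish through the sequence counter. The sketch below models that discipline in plain C as a simplified illustration only: it ignores the memory-barrier fine print of the kernel's seqcount_t, treats the data snapshot as a benign race, and every name in it is invented.

/*
 * Simplified model of the timekeeper_seq read/write discipline.
 * Writers take a lock and make the sequence odd while updating;
 * readers wait for an even sequence, snapshot, and retry if the
 * sequence changed underneath them.
 */
#include <stdatomic.h>
#include <stdint.h>
#include <stdio.h>

struct sample_tk {            /* invented stand-in for struct timekeeper */
	uint64_t xtime_sec;
	uint64_t xtime_nsec;
};

static struct sample_tk tk_data;
static atomic_uint tk_seq;                     /* stand-in for timekeeper_seq  */
static atomic_flag tk_lock = ATOMIC_FLAG_INIT; /* stand-in for timekeeper_lock */

static void tk_write(uint64_t sec, uint64_t nsec)
{
	while (atomic_flag_test_and_set(&tk_lock))  /* writer-vs-writer lock */
		;
	atomic_fetch_add(&tk_seq, 1);               /* sequence becomes odd  */
	tk_data.xtime_sec = sec;
	tk_data.xtime_nsec = nsec;
	atomic_fetch_add(&tk_seq, 1);               /* sequence even again   */
	atomic_flag_clear(&tk_lock);
}

static void tk_read(uint64_t *sec, uint64_t *nsec)
{
	unsigned int start;

	do {
		do {                                /* wait until no update in flight */
			start = atomic_load(&tk_seq);
		} while (start & 1);
		*sec  = tk_data.xtime_sec;
		*nsec = tk_data.xtime_nsec;
	} while (atomic_load(&tk_seq) != start);    /* a writer ran: retry */
}

int main(void)
{
	uint64_t s, ns;

	tk_write(1234, 567890);
	tk_read(&s, &ns);
	printf("%llu.%09llu\n", (unsigned long long)s, (unsigned long long)ns);
	return 0;
}

This split between a writer lock and a reader sequence is what lets the patch drop xtime_lock: readers never block writers, and update_wall_time() can update the shadow timekeeper outside the seqcount write section before the final memcpy.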