diff options
Diffstat (limited to 'kernel/time')
| -rw-r--r-- | kernel/time/Kconfig | 199 | ||||
| -rw-r--r-- | kernel/time/Makefile | 10 | ||||
| -rw-r--r-- | kernel/time/alarmtimer.c | 861 | ||||
| -rw-r--r-- | kernel/time/clockevents.c | 581 | ||||
| -rw-r--r-- | kernel/time/clocksource.c | 541 | ||||
| -rw-r--r-- | kernel/time/jiffies.c | 64 | ||||
| -rw-r--r-- | kernel/time/ntp.c | 701 | ||||
| -rw-r--r-- | kernel/time/ntp_internal.h | 12 | ||||
| -rw-r--r-- | kernel/time/posix-clock.c | 446 | ||||
| -rw-r--r-- | kernel/time/sched_clock.c | 217 | ||||
| -rw-r--r-- | kernel/time/tick-broadcast-hrtimer.c | 106 | ||||
| -rw-r--r-- | kernel/time/tick-broadcast.c | 546 | ||||
| -rw-r--r-- | kernel/time/tick-common.c | 257 | ||||
| -rw-r--r-- | kernel/time/tick-internal.h | 51 | ||||
| -rw-r--r-- | kernel/time/tick-oneshot.c | 54 | ||||
| -rw-r--r-- | kernel/time/tick-sched.c | 879 | ||||
| -rw-r--r-- | kernel/time/timecompare.c | 191 | ||||
| -rw-r--r-- | kernel/time/timekeeping.c | 1528 | ||||
| -rw-r--r-- | kernel/time/timekeeping_debug.c | 74 | ||||
| -rw-r--r-- | kernel/time/timekeeping_internal.h | 14 | ||||
| -rw-r--r-- | kernel/time/timer_list.c | 127 | ||||
| -rw-r--r-- | kernel/time/timer_stats.c | 16 |
22 files changed, 5987 insertions, 1488 deletions
diff --git a/kernel/time/Kconfig b/kernel/time/Kconfig index 95ed42951e0..f448513a45e 100644 --- a/kernel/time/Kconfig +++ b/kernel/time/Kconfig @@ -1,29 +1,204 @@ # # Timer subsystem related configuration options # + +# Options selectable by arch Kconfig + +# Watchdog function for clocksources to detect instabilities +config CLOCKSOURCE_WATCHDOG + bool + +# Architecture has extra clocksource data +config ARCH_CLOCKSOURCE_DATA + bool + +# Timekeeping vsyscall support +config GENERIC_TIME_VSYSCALL + bool + +# Timekeeping vsyscall support +config GENERIC_TIME_VSYSCALL_OLD + bool + +# ktime_t scalar 64bit nsec representation +config KTIME_SCALAR + bool + +# Old style timekeeping +config ARCH_USES_GETTIMEOFFSET + bool + +# The generic clock events infrastructure +config GENERIC_CLOCKEVENTS + bool + +# Migration helper. Builds, but does not invoke +config GENERIC_CLOCKEVENTS_BUILD + bool + default y + depends on GENERIC_CLOCKEVENTS + +# Architecture can handle broadcast in a driver-agnostic way +config ARCH_HAS_TICK_BROADCAST + bool + +# Clockevents broadcasting infrastructure +config GENERIC_CLOCKEVENTS_BROADCAST + bool + depends on GENERIC_CLOCKEVENTS + +# Automatically adjust the min. reprogramming time for +# clock event device +config GENERIC_CLOCKEVENTS_MIN_ADJUST + bool + +# Generic update of CMOS clock +config GENERIC_CMOS_UPDATE + bool + +if GENERIC_CLOCKEVENTS +menu "Timers subsystem" + +# Core internal switch. Selected by NO_HZ_COMMON / HIGH_RES_TIMERS. This is +# only related to the tick functionality. Oneshot clockevent devices +# are supported independ of this. config TICK_ONESHOT bool -config NO_HZ - bool "Tickless System (Dynamic Ticks)" - depends on GENERIC_TIME && GENERIC_CLOCKEVENTS +config NO_HZ_COMMON + bool + depends on !ARCH_USES_GETTIMEOFFSET && GENERIC_CLOCKEVENTS select TICK_ONESHOT + +choice + prompt "Timer tick handling" + default NO_HZ_IDLE if NO_HZ + +config HZ_PERIODIC + bool "Periodic timer ticks (constant rate, no dynticks)" help - This option enables a tickless system: timer interrupts will - only trigger on an as-needed basis both when the system is - busy and when the system is idle. + This option keeps the tick running periodically at a constant + rate, even when the CPU doesn't need it. + +config NO_HZ_IDLE + bool "Idle dynticks system (tickless idle)" + depends on !ARCH_USES_GETTIMEOFFSET && GENERIC_CLOCKEVENTS + select NO_HZ_COMMON + help + This option enables a tickless idle system: timer interrupts + will only trigger on an as-needed basis when the system is idle. + This is usually interesting for energy saving. + + Most of the time you want to say Y here. + +config NO_HZ_FULL + bool "Full dynticks system (tickless)" + # NO_HZ_COMMON dependency + depends on !ARCH_USES_GETTIMEOFFSET && GENERIC_CLOCKEVENTS + # We need at least one periodic CPU for timekeeping + depends on SMP + # RCU_USER_QS dependency + depends on HAVE_CONTEXT_TRACKING + # VIRT_CPU_ACCOUNTING_GEN dependency + depends on HAVE_VIRT_CPU_ACCOUNTING_GEN + select NO_HZ_COMMON + select RCU_USER_QS + select RCU_NOCB_CPU + select VIRT_CPU_ACCOUNTING_GEN + select IRQ_WORK + help + Adaptively try to shutdown the tick whenever possible, even when + the CPU is running tasks. Typically this requires running a single + task on the CPU. Chances for running tickless are maximized when + the task mostly runs in userspace and has few kernel activity. + + You need to fill up the nohz_full boot parameter with the + desired range of dynticks CPUs. + + This is implemented at the expense of some overhead in user <-> kernel + transitions: syscalls, exceptions and interrupts. Even when it's + dynamically off. + + Say N. + +endchoice + +config NO_HZ_FULL_ALL + bool "Full dynticks system on all CPUs by default (except CPU 0)" + depends on NO_HZ_FULL + help + If the user doesn't pass the nohz_full boot option to + define the range of full dynticks CPUs, consider that all + CPUs in the system are full dynticks by default. + Note the boot CPU will still be kept outside the range to + handle the timekeeping duty. + +config NO_HZ_FULL_SYSIDLE + bool "Detect full-system idle state for full dynticks system" + depends on NO_HZ_FULL + default n + help + At least one CPU must keep the scheduling-clock tick running for + timekeeping purposes whenever there is a non-idle CPU, where + "non-idle" also includes dynticks CPUs as long as they are + running non-idle tasks. Because the underlying adaptive-tick + support cannot distinguish between all CPUs being idle and + all CPUs each running a single task in dynticks mode, the + underlying support simply ensures that there is always a CPU + handling the scheduling-clock tick, whether or not all CPUs + are idle. This Kconfig option enables scalable detection of + the all-CPUs-idle state, thus allowing the scheduling-clock + tick to be disabled when all CPUs are idle. Note that scalable + detection of the all-CPUs-idle state means that larger systems + will be slower to declare the all-CPUs-idle state. + + Say Y if you would like to help debug all-CPUs-idle detection. + + Say N if you are unsure. + +config NO_HZ_FULL_SYSIDLE_SMALL + int "Number of CPUs above which large-system approach is used" + depends on NO_HZ_FULL_SYSIDLE + range 1 NR_CPUS + default 8 + help + The full-system idle detection mechanism takes a lazy approach + on large systems, as is required to attain decent scalability. + However, on smaller systems, scalability is not anywhere near as + large a concern as is energy efficiency. The sysidle subsystem + therefore uses a fast but non-scalable algorithm for small + systems and a lazier but scalable algorithm for large systems. + This Kconfig parameter defines the number of CPUs in the largest + system that will be considered to be "small". + + The default value will be fine in most cases. Battery-powered + systems that (1) enable NO_HZ_FULL_SYSIDLE, (2) have larger + numbers of CPUs, and (3) are suffering from battery-lifetime + problems due to long sysidle latencies might wish to experiment + with larger values for this Kconfig parameter. On the other + hand, they might be even better served by disabling NO_HZ_FULL + entirely, given that NO_HZ_FULL is intended for HPC and + real-time workloads that at present do not tend to be run on + battery-powered systems. + + Take the default if you are unsure. + +config NO_HZ + bool "Old Idle dynticks config" + depends on !ARCH_USES_GETTIMEOFFSET && GENERIC_CLOCKEVENTS + help + This is the old config entry that enables dynticks idle. + We keep it around for a little while to enforce backward + compatibility with older config files. config HIGH_RES_TIMERS bool "High Resolution Timer Support" - depends on GENERIC_TIME && GENERIC_CLOCKEVENTS + depends on !ARCH_USES_GETTIMEOFFSET && GENERIC_CLOCKEVENTS select TICK_ONESHOT help This option enables high resolution timer support. If your hardware is not capable then this option only increases the size of the kernel image. -config GENERIC_CLOCKEVENTS_BUILD - bool - default y - depends on GENERIC_CLOCKEVENTS || GENERIC_CLOCKEVENTS_MIGR - +endmenu +endif diff --git a/kernel/time/Makefile b/kernel/time/Makefile index ee266620b06..57a413fd0eb 100644 --- a/kernel/time/Makefile +++ b/kernel/time/Makefile @@ -1,8 +1,14 @@ -obj-y += timekeeping.o ntp.o clocksource.o jiffies.o timer_list.o timecompare.o timeconv.o +obj-y += timekeeping.o ntp.o clocksource.o jiffies.o timer_list.o +obj-y += timeconv.o posix-clock.o alarmtimer.o obj-$(CONFIG_GENERIC_CLOCKEVENTS_BUILD) += clockevents.o obj-$(CONFIG_GENERIC_CLOCKEVENTS) += tick-common.o -obj-$(CONFIG_GENERIC_CLOCKEVENTS_BROADCAST) += tick-broadcast.o +ifeq ($(CONFIG_GENERIC_CLOCKEVENTS_BROADCAST),y) + obj-y += tick-broadcast.o + obj-$(CONFIG_TICK_ONESHOT) += tick-broadcast-hrtimer.o +endif +obj-$(CONFIG_GENERIC_SCHED_CLOCK) += sched_clock.o obj-$(CONFIG_TICK_ONESHOT) += tick-oneshot.o obj-$(CONFIG_TICK_ONESHOT) += tick-sched.o obj-$(CONFIG_TIMER_STATS) += timer_stats.o +obj-$(CONFIG_DEBUG_FS) += timekeeping_debug.o diff --git a/kernel/time/alarmtimer.c b/kernel/time/alarmtimer.c new file mode 100644 index 00000000000..fe75444ae7e --- /dev/null +++ b/kernel/time/alarmtimer.c @@ -0,0 +1,861 @@ +/* + * Alarmtimer interface + * + * This interface provides a timer which is similarto hrtimers, + * but triggers a RTC alarm if the box is suspend. + * + * This interface is influenced by the Android RTC Alarm timer + * interface. + * + * Copyright (C) 2010 IBM Corperation + * + * Author: John Stultz <john.stultz@linaro.org> + * + * This program is free software; you can redistribute it and/or modify + * it under the terms of the GNU General Public License version 2 as + * published by the Free Software Foundation. + */ +#include <linux/time.h> +#include <linux/hrtimer.h> +#include <linux/timerqueue.h> +#include <linux/rtc.h> +#include <linux/alarmtimer.h> +#include <linux/mutex.h> +#include <linux/platform_device.h> +#include <linux/posix-timers.h> +#include <linux/workqueue.h> +#include <linux/freezer.h> + +/** + * struct alarm_base - Alarm timer bases + * @lock: Lock for syncrhonized access to the base + * @timerqueue: Timerqueue head managing the list of events + * @timer: hrtimer used to schedule events while running + * @gettime: Function to read the time correlating to the base + * @base_clockid: clockid for the base + */ +static struct alarm_base { + spinlock_t lock; + struct timerqueue_head timerqueue; + ktime_t (*gettime)(void); + clockid_t base_clockid; +} alarm_bases[ALARM_NUMTYPE]; + +/* freezer delta & lock used to handle clock_nanosleep triggered wakeups */ +static ktime_t freezer_delta; +static DEFINE_SPINLOCK(freezer_delta_lock); + +static struct wakeup_source *ws; + +#ifdef CONFIG_RTC_CLASS +/* rtc timer and device for setting alarm wakeups at suspend */ +static struct rtc_timer rtctimer; +static struct rtc_device *rtcdev; +static DEFINE_SPINLOCK(rtcdev_lock); + +/** + * alarmtimer_get_rtcdev - Return selected rtcdevice + * + * This function returns the rtc device to use for wakealarms. + * If one has not already been chosen, it checks to see if a + * functional rtc device is available. + */ +struct rtc_device *alarmtimer_get_rtcdev(void) +{ + unsigned long flags; + struct rtc_device *ret; + + spin_lock_irqsave(&rtcdev_lock, flags); + ret = rtcdev; + spin_unlock_irqrestore(&rtcdev_lock, flags); + + return ret; +} + + +static int alarmtimer_rtc_add_device(struct device *dev, + struct class_interface *class_intf) +{ + unsigned long flags; + struct rtc_device *rtc = to_rtc_device(dev); + + if (rtcdev) + return -EBUSY; + + if (!rtc->ops->set_alarm) + return -1; + if (!device_may_wakeup(rtc->dev.parent)) + return -1; + + spin_lock_irqsave(&rtcdev_lock, flags); + if (!rtcdev) { + rtcdev = rtc; + /* hold a reference so it doesn't go away */ + get_device(dev); + } + spin_unlock_irqrestore(&rtcdev_lock, flags); + return 0; +} + +static inline void alarmtimer_rtc_timer_init(void) +{ + rtc_timer_init(&rtctimer, NULL, NULL); +} + +static struct class_interface alarmtimer_rtc_interface = { + .add_dev = &alarmtimer_rtc_add_device, +}; + +static int alarmtimer_rtc_interface_setup(void) +{ + alarmtimer_rtc_interface.class = rtc_class; + return class_interface_register(&alarmtimer_rtc_interface); +} +static void alarmtimer_rtc_interface_remove(void) +{ + class_interface_unregister(&alarmtimer_rtc_interface); +} +#else +struct rtc_device *alarmtimer_get_rtcdev(void) +{ + return NULL; +} +#define rtcdev (NULL) +static inline int alarmtimer_rtc_interface_setup(void) { return 0; } +static inline void alarmtimer_rtc_interface_remove(void) { } +static inline void alarmtimer_rtc_timer_init(void) { } +#endif + +/** + * alarmtimer_enqueue - Adds an alarm timer to an alarm_base timerqueue + * @base: pointer to the base where the timer is being run + * @alarm: pointer to alarm being enqueued. + * + * Adds alarm to a alarm_base timerqueue + * + * Must hold base->lock when calling. + */ +static void alarmtimer_enqueue(struct alarm_base *base, struct alarm *alarm) +{ + if (alarm->state & ALARMTIMER_STATE_ENQUEUED) + timerqueue_del(&base->timerqueue, &alarm->node); + + timerqueue_add(&base->timerqueue, &alarm->node); + alarm->state |= ALARMTIMER_STATE_ENQUEUED; +} + +/** + * alarmtimer_dequeue - Removes an alarm timer from an alarm_base timerqueue + * @base: pointer to the base where the timer is running + * @alarm: pointer to alarm being removed + * + * Removes alarm to a alarm_base timerqueue + * + * Must hold base->lock when calling. + */ +static void alarmtimer_dequeue(struct alarm_base *base, struct alarm *alarm) +{ + if (!(alarm->state & ALARMTIMER_STATE_ENQUEUED)) + return; + + timerqueue_del(&base->timerqueue, &alarm->node); + alarm->state &= ~ALARMTIMER_STATE_ENQUEUED; +} + + +/** + * alarmtimer_fired - Handles alarm hrtimer being fired. + * @timer: pointer to hrtimer being run + * + * When a alarm timer fires, this runs through the timerqueue to + * see which alarms expired, and runs those. If there are more alarm + * timers queued for the future, we set the hrtimer to fire when + * when the next future alarm timer expires. + */ +static enum hrtimer_restart alarmtimer_fired(struct hrtimer *timer) +{ + struct alarm *alarm = container_of(timer, struct alarm, timer); + struct alarm_base *base = &alarm_bases[alarm->type]; + unsigned long flags; + int ret = HRTIMER_NORESTART; + int restart = ALARMTIMER_NORESTART; + + spin_lock_irqsave(&base->lock, flags); + alarmtimer_dequeue(base, alarm); + spin_unlock_irqrestore(&base->lock, flags); + + if (alarm->function) + restart = alarm->function(alarm, base->gettime()); + + spin_lock_irqsave(&base->lock, flags); + if (restart != ALARMTIMER_NORESTART) { + hrtimer_set_expires(&alarm->timer, alarm->node.expires); + alarmtimer_enqueue(base, alarm); + ret = HRTIMER_RESTART; + } + spin_unlock_irqrestore(&base->lock, flags); + + return ret; + +} + +ktime_t alarm_expires_remaining(const struct alarm *alarm) +{ + struct alarm_base *base = &alarm_bases[alarm->type]; + return ktime_sub(alarm->node.expires, base->gettime()); +} +EXPORT_SYMBOL_GPL(alarm_expires_remaining); + +#ifdef CONFIG_RTC_CLASS +/** + * alarmtimer_suspend - Suspend time callback + * @dev: unused + * @state: unused + * + * When we are going into suspend, we look through the bases + * to see which is the soonest timer to expire. We then + * set an rtc timer to fire that far into the future, which + * will wake us from suspend. + */ +static int alarmtimer_suspend(struct device *dev) +{ + struct rtc_time tm; + ktime_t min, now; + unsigned long flags; + struct rtc_device *rtc; + int i; + int ret; + + spin_lock_irqsave(&freezer_delta_lock, flags); + min = freezer_delta; + freezer_delta = ktime_set(0, 0); + spin_unlock_irqrestore(&freezer_delta_lock, flags); + + rtc = alarmtimer_get_rtcdev(); + /* If we have no rtcdev, just return */ + if (!rtc) + return 0; + + /* Find the soonest timer to expire*/ + for (i = 0; i < ALARM_NUMTYPE; i++) { + struct alarm_base *base = &alarm_bases[i]; + struct timerqueue_node *next; + ktime_t delta; + + spin_lock_irqsave(&base->lock, flags); + next = timerqueue_getnext(&base->timerqueue); + spin_unlock_irqrestore(&base->lock, flags); + if (!next) + continue; + delta = ktime_sub(next->expires, base->gettime()); + if (!min.tv64 || (delta.tv64 < min.tv64)) + min = delta; + } + if (min.tv64 == 0) + return 0; + + if (ktime_to_ns(min) < 2 * NSEC_PER_SEC) { + __pm_wakeup_event(ws, 2 * MSEC_PER_SEC); + return -EBUSY; + } + + /* Setup an rtc timer to fire that far in the future */ + rtc_timer_cancel(rtc, &rtctimer); + rtc_read_time(rtc, &tm); + now = rtc_tm_to_ktime(tm); + now = ktime_add(now, min); + + /* Set alarm, if in the past reject suspend briefly to handle */ + ret = rtc_timer_start(rtc, &rtctimer, now, ktime_set(0, 0)); + if (ret < 0) + __pm_wakeup_event(ws, MSEC_PER_SEC); + return ret; +} +#else +static int alarmtimer_suspend(struct device *dev) +{ + return 0; +} +#endif + +static void alarmtimer_freezerset(ktime_t absexp, enum alarmtimer_type type) +{ + ktime_t delta; + unsigned long flags; + struct alarm_base *base = &alarm_bases[type]; + + delta = ktime_sub(absexp, base->gettime()); + + spin_lock_irqsave(&freezer_delta_lock, flags); + if (!freezer_delta.tv64 || (delta.tv64 < freezer_delta.tv64)) + freezer_delta = delta; + spin_unlock_irqrestore(&freezer_delta_lock, flags); +} + + +/** + * alarm_init - Initialize an alarm structure + * @alarm: ptr to alarm to be initialized + * @type: the type of the alarm + * @function: callback that is run when the alarm fires + */ +void alarm_init(struct alarm *alarm, enum alarmtimer_type type, + enum alarmtimer_restart (*function)(struct alarm *, ktime_t)) +{ + timerqueue_init(&alarm->node); + hrtimer_init(&alarm->timer, alarm_bases[type].base_clockid, + HRTIMER_MODE_ABS); + alarm->timer.function = alarmtimer_fired; + alarm->function = function; + alarm->type = type; + alarm->state = ALARMTIMER_STATE_INACTIVE; +} +EXPORT_SYMBOL_GPL(alarm_init); + +/** + * alarm_start - Sets an absolute alarm to fire + * @alarm: ptr to alarm to set + * @start: time to run the alarm + */ +int alarm_start(struct alarm *alarm, ktime_t start) +{ + struct alarm_base *base = &alarm_bases[alarm->type]; + unsigned long flags; + int ret; + + spin_lock_irqsave(&base->lock, flags); + alarm->node.expires = start; + alarmtimer_enqueue(base, alarm); + ret = hrtimer_start(&alarm->timer, alarm->node.expires, + HRTIMER_MODE_ABS); + spin_unlock_irqrestore(&base->lock, flags); + return ret; +} +EXPORT_SYMBOL_GPL(alarm_start); + +/** + * alarm_start_relative - Sets a relative alarm to fire + * @alarm: ptr to alarm to set + * @start: time relative to now to run the alarm + */ +int alarm_start_relative(struct alarm *alarm, ktime_t start) +{ + struct alarm_base *base = &alarm_bases[alarm->type]; + + start = ktime_add(start, base->gettime()); + return alarm_start(alarm, start); +} +EXPORT_SYMBOL_GPL(alarm_start_relative); + +void alarm_restart(struct alarm *alarm) +{ + struct alarm_base *base = &alarm_bases[alarm->type]; + unsigned long flags; + + spin_lock_irqsave(&base->lock, flags); + hrtimer_set_expires(&alarm->timer, alarm->node.expires); + hrtimer_restart(&alarm->timer); + alarmtimer_enqueue(base, alarm); + spin_unlock_irqrestore(&base->lock, flags); +} +EXPORT_SYMBOL_GPL(alarm_restart); + +/** + * alarm_try_to_cancel - Tries to cancel an alarm timer + * @alarm: ptr to alarm to be canceled + * + * Returns 1 if the timer was canceled, 0 if it was not running, + * and -1 if the callback was running + */ +int alarm_try_to_cancel(struct alarm *alarm) +{ + struct alarm_base *base = &alarm_bases[alarm->type]; + unsigned long flags; + int ret; + + spin_lock_irqsave(&base->lock, flags); + ret = hrtimer_try_to_cancel(&alarm->timer); + if (ret >= 0) + alarmtimer_dequeue(base, alarm); + spin_unlock_irqrestore(&base->lock, flags); + return ret; +} +EXPORT_SYMBOL_GPL(alarm_try_to_cancel); + + +/** + * alarm_cancel - Spins trying to cancel an alarm timer until it is done + * @alarm: ptr to alarm to be canceled + * + * Returns 1 if the timer was canceled, 0 if it was not active. + */ +int alarm_cancel(struct alarm *alarm) +{ + for (;;) { + int ret = alarm_try_to_cancel(alarm); + if (ret >= 0) + return ret; + cpu_relax(); + } +} +EXPORT_SYMBOL_GPL(alarm_cancel); + + +u64 alarm_forward(struct alarm *alarm, ktime_t now, ktime_t interval) +{ + u64 overrun = 1; + ktime_t delta; + + delta = ktime_sub(now, alarm->node.expires); + + if (delta.tv64 < 0) + return 0; + + if (unlikely(delta.tv64 >= interval.tv64)) { + s64 incr = ktime_to_ns(interval); + + overrun = ktime_divns(delta, incr); + + alarm->node.expires = ktime_add_ns(alarm->node.expires, + incr*overrun); + + if (alarm->node.expires.tv64 > now.tv64) + return overrun; + /* + * This (and the ktime_add() below) is the + * correction for exact: + */ + overrun++; + } + + alarm->node.expires = ktime_add(alarm->node.expires, interval); + return overrun; +} +EXPORT_SYMBOL_GPL(alarm_forward); + +u64 alarm_forward_now(struct alarm *alarm, ktime_t interval) +{ + struct alarm_base *base = &alarm_bases[alarm->type]; + + return alarm_forward(alarm, base->gettime(), interval); +} +EXPORT_SYMBOL_GPL(alarm_forward_now); + + +/** + * clock2alarm - helper that converts from clockid to alarmtypes + * @clockid: clockid. + */ +static enum alarmtimer_type clock2alarm(clockid_t clockid) +{ + if (clockid == CLOCK_REALTIME_ALARM) + return ALARM_REALTIME; + if (clockid == CLOCK_BOOTTIME_ALARM) + return ALARM_BOOTTIME; + return -1; +} + +/** + * alarm_handle_timer - Callback for posix timers + * @alarm: alarm that fired + * + * Posix timer callback for expired alarm timers. + */ +static enum alarmtimer_restart alarm_handle_timer(struct alarm *alarm, + ktime_t now) +{ + struct k_itimer *ptr = container_of(alarm, struct k_itimer, + it.alarm.alarmtimer); + if (posix_timer_event(ptr, 0) != 0) + ptr->it_overrun++; + + /* Re-add periodic timers */ + if (ptr->it.alarm.interval.tv64) { + ptr->it_overrun += alarm_forward(alarm, now, + ptr->it.alarm.interval); + return ALARMTIMER_RESTART; + } + return ALARMTIMER_NORESTART; +} + +/** + * alarm_clock_getres - posix getres interface + * @which_clock: clockid + * @tp: timespec to fill + * + * Returns the granularity of underlying alarm base clock + */ +static int alarm_clock_getres(const clockid_t which_clock, struct timespec *tp) +{ + clockid_t baseid = alarm_bases[clock2alarm(which_clock)].base_clockid; + + if (!alarmtimer_get_rtcdev()) + return -EINVAL; + + return hrtimer_get_res(baseid, tp); +} + +/** + * alarm_clock_get - posix clock_get interface + * @which_clock: clockid + * @tp: timespec to fill. + * + * Provides the underlying alarm base time. + */ +static int alarm_clock_get(clockid_t which_clock, struct timespec *tp) +{ + struct alarm_base *base = &alarm_bases[clock2alarm(which_clock)]; + + if (!alarmtimer_get_rtcdev()) + return -EINVAL; + + *tp = ktime_to_timespec(base->gettime()); + return 0; +} + +/** + * alarm_timer_create - posix timer_create interface + * @new_timer: k_itimer pointer to manage + * + * Initializes the k_itimer structure. + */ +static int alarm_timer_create(struct k_itimer *new_timer) +{ + enum alarmtimer_type type; + struct alarm_base *base; + + if (!alarmtimer_get_rtcdev()) + return -ENOTSUPP; + + if (!capable(CAP_WAKE_ALARM)) + return -EPERM; + + type = clock2alarm(new_timer->it_clock); + base = &alarm_bases[type]; + alarm_init(&new_timer->it.alarm.alarmtimer, type, alarm_handle_timer); + return 0; +} + +/** + * alarm_timer_get - posix timer_get interface + * @new_timer: k_itimer pointer + * @cur_setting: itimerspec data to fill + * + * Copies the itimerspec data out from the k_itimer + */ +static void alarm_timer_get(struct k_itimer *timr, + struct itimerspec *cur_setting) +{ + memset(cur_setting, 0, sizeof(struct itimerspec)); + + cur_setting->it_interval = + ktime_to_timespec(timr->it.alarm.interval); + cur_setting->it_value = + ktime_to_timespec(timr->it.alarm.alarmtimer.node.expires); + return; +} + +/** + * alarm_timer_del - posix timer_del interface + * @timr: k_itimer pointer to be deleted + * + * Cancels any programmed alarms for the given timer. + */ +static int alarm_timer_del(struct k_itimer *timr) +{ + if (!rtcdev) + return -ENOTSUPP; + + if (alarm_try_to_cancel(&timr->it.alarm.alarmtimer) < 0) + return TIMER_RETRY; + + return 0; +} + +/** + * alarm_timer_set - posix timer_set interface + * @timr: k_itimer pointer to be deleted + * @flags: timer flags + * @new_setting: itimerspec to be used + * @old_setting: itimerspec being replaced + * + * Sets the timer to new_setting, and starts the timer. + */ +static int alarm_timer_set(struct k_itimer *timr, int flags, + struct itimerspec *new_setting, + struct itimerspec *old_setting) +{ + ktime_t exp; + + if (!rtcdev) + return -ENOTSUPP; + + if (flags & ~TIMER_ABSTIME) + return -EINVAL; + + if (old_setting) + alarm_timer_get(timr, old_setting); + + /* If the timer was already set, cancel it */ + if (alarm_try_to_cancel(&timr->it.alarm.alarmtimer) < 0) + return TIMER_RETRY; + + /* start the timer */ + timr->it.alarm.interval = timespec_to_ktime(new_setting->it_interval); + exp = timespec_to_ktime(new_setting->it_value); + /* Convert (if necessary) to absolute time */ + if (flags != TIMER_ABSTIME) { + ktime_t now; + + now = alarm_bases[timr->it.alarm.alarmtimer.type].gettime(); + exp = ktime_add(now, exp); + } + + alarm_start(&timr->it.alarm.alarmtimer, exp); + return 0; +} + +/** + * alarmtimer_nsleep_wakeup - Wakeup function for alarm_timer_nsleep + * @alarm: ptr to alarm that fired + * + * Wakes up the task that set the alarmtimer + */ +static enum alarmtimer_restart alarmtimer_nsleep_wakeup(struct alarm *alarm, + ktime_t now) +{ + struct task_struct *task = (struct task_struct *)alarm->data; + + alarm->data = NULL; + if (task) + wake_up_process(task); + return ALARMTIMER_NORESTART; +} + +/** + * alarmtimer_do_nsleep - Internal alarmtimer nsleep implementation + * @alarm: ptr to alarmtimer + * @absexp: absolute expiration time + * + * Sets the alarm timer and sleeps until it is fired or interrupted. + */ +static int alarmtimer_do_nsleep(struct alarm *alarm, ktime_t absexp) +{ + alarm->data = (void *)current; + do { + set_current_state(TASK_INTERRUPTIBLE); + alarm_start(alarm, absexp); + if (likely(alarm->data)) + schedule(); + + alarm_cancel(alarm); + } while (alarm->data && !signal_pending(current)); + + __set_current_state(TASK_RUNNING); + + return (alarm->data == NULL); +} + + +/** + * update_rmtp - Update remaining timespec value + * @exp: expiration time + * @type: timer type + * @rmtp: user pointer to remaining timepsec value + * + * Helper function that fills in rmtp value with time between + * now and the exp value + */ +static int update_rmtp(ktime_t exp, enum alarmtimer_type type, + struct timespec __user *rmtp) +{ + struct timespec rmt; + ktime_t rem; + + rem = ktime_sub(exp, alarm_bases[type].gettime()); + + if (rem.tv64 <= 0) + return 0; + rmt = ktime_to_timespec(rem); + + if (copy_to_user(rmtp, &rmt, sizeof(*rmtp))) + return -EFAULT; + + return 1; + +} + +/** + * alarm_timer_nsleep_restart - restartblock alarmtimer nsleep + * @restart: ptr to restart block + * + * Handles restarted clock_nanosleep calls + */ +static long __sched alarm_timer_nsleep_restart(struct restart_block *restart) +{ + enum alarmtimer_type type = restart->nanosleep.clockid; + ktime_t exp; + struct timespec __user *rmtp; + struct alarm alarm; + int ret = 0; + + exp.tv64 = restart->nanosleep.expires; + alarm_init(&alarm, type, alarmtimer_nsleep_wakeup); + + if (alarmtimer_do_nsleep(&alarm, exp)) + goto out; + + if (freezing(current)) + alarmtimer_freezerset(exp, type); + + rmtp = restart->nanosleep.rmtp; + if (rmtp) { + ret = update_rmtp(exp, type, rmtp); + if (ret <= 0) + goto out; + } + + + /* The other values in restart are already filled in */ + ret = -ERESTART_RESTARTBLOCK; +out: + return ret; +} + +/** + * alarm_timer_nsleep - alarmtimer nanosleep + * @which_clock: clockid + * @flags: determins abstime or relative + * @tsreq: requested sleep time (abs or rel) + * @rmtp: remaining sleep time saved + * + * Handles clock_nanosleep calls against _ALARM clockids + */ +static int alarm_timer_nsleep(const clockid_t which_clock, int flags, + struct timespec *tsreq, struct timespec __user *rmtp) +{ + enum alarmtimer_type type = clock2alarm(which_clock); + struct alarm alarm; + ktime_t exp; + int ret = 0; + struct restart_block *restart; + + if (!alarmtimer_get_rtcdev()) + return -ENOTSUPP; + + if (flags & ~TIMER_ABSTIME) + return -EINVAL; + + if (!capable(CAP_WAKE_ALARM)) + return -EPERM; + + alarm_init(&alarm, type, alarmtimer_nsleep_wakeup); + + exp = timespec_to_ktime(*tsreq); + /* Convert (if necessary) to absolute time */ + if (flags != TIMER_ABSTIME) { + ktime_t now = alarm_bases[type].gettime(); + exp = ktime_add(now, exp); + } + + if (alarmtimer_do_nsleep(&alarm, exp)) + goto out; + + if (freezing(current)) + alarmtimer_freezerset(exp, type); + + /* abs timers don't set remaining time or restart */ + if (flags == TIMER_ABSTIME) { + ret = -ERESTARTNOHAND; + goto out; + } + + if (rmtp) { + ret = update_rmtp(exp, type, rmtp); + if (ret <= 0) + goto out; + } + + restart = ¤t_thread_info()->restart_block; + restart->fn = alarm_timer_nsleep_restart; + restart->nanosleep.clockid = type; + restart->nanosleep.expires = exp.tv64; + restart->nanosleep.rmtp = rmtp; + ret = -ERESTART_RESTARTBLOCK; + +out: + return ret; +} + + +/* Suspend hook structures */ +static const struct dev_pm_ops alarmtimer_pm_ops = { + .suspend = alarmtimer_suspend, +}; + +static struct platform_driver alarmtimer_driver = { + .driver = { + .name = "alarmtimer", + .pm = &alarmtimer_pm_ops, + } +}; + +/** + * alarmtimer_init - Initialize alarm timer code + * + * This function initializes the alarm bases and registers + * the posix clock ids. + */ +static int __init alarmtimer_init(void) +{ + struct platform_device *pdev; + int error = 0; + int i; + struct k_clock alarm_clock = { + .clock_getres = alarm_clock_getres, + .clock_get = alarm_clock_get, + .timer_create = alarm_timer_create, + .timer_set = alarm_timer_set, + .timer_del = alarm_timer_del, + .timer_get = alarm_timer_get, + .nsleep = alarm_timer_nsleep, + }; + + alarmtimer_rtc_timer_init(); + + posix_timers_register_clock(CLOCK_REALTIME_ALARM, &alarm_clock); + posix_timers_register_clock(CLOCK_BOOTTIME_ALARM, &alarm_clock); + + /* Initialize alarm bases */ + alarm_bases[ALARM_REALTIME].base_clockid = CLOCK_REALTIME; + alarm_bases[ALARM_REALTIME].gettime = &ktime_get_real; + alarm_bases[ALARM_BOOTTIME].base_clockid = CLOCK_BOOTTIME; + alarm_bases[ALARM_BOOTTIME].gettime = &ktime_get_boottime; + for (i = 0; i < ALARM_NUMTYPE; i++) { + timerqueue_init_head(&alarm_bases[i].timerqueue); + spin_lock_init(&alarm_bases[i].lock); + } + + error = alarmtimer_rtc_interface_setup(); + if (error) + return error; + + error = platform_driver_register(&alarmtimer_driver); + if (error) + goto out_if; + + pdev = platform_device_register_simple("alarmtimer", -1, NULL, 0); + if (IS_ERR(pdev)) { + error = PTR_ERR(pdev); + goto out_drv; + } + ws = wakeup_source_register("alarmtimer"); + return 0; + +out_drv: + platform_driver_unregister(&alarmtimer_driver); +out_if: + alarmtimer_rtc_interface_remove(); + return error; +} +device_initcall(alarmtimer_init); diff --git a/kernel/time/clockevents.c b/kernel/time/clockevents.c index 6f740d9f094..9c94c19f130 100644 --- a/kernel/time/clockevents.c +++ b/kernel/time/clockevents.c @@ -15,46 +15,82 @@ #include <linux/hrtimer.h> #include <linux/init.h> #include <linux/module.h> -#include <linux/notifier.h> #include <linux/smp.h> -#include <linux/sysdev.h> -#include <linux/tick.h> +#include <linux/device.h> #include "tick-internal.h" /* The registered clock event devices */ static LIST_HEAD(clockevent_devices); static LIST_HEAD(clockevents_released); - -/* Notification for clock events */ -static RAW_NOTIFIER_HEAD(clockevents_chain); - /* Protection for the above */ static DEFINE_RAW_SPINLOCK(clockevents_lock); +/* Protection for unbind operations */ +static DEFINE_MUTEX(clockevents_mutex); -/** - * clockevents_delta2ns - Convert a latch value (device ticks) to nanoseconds - * @latch: value to convert - * @evt: pointer to clock event device descriptor - * - * Math helper, returns latch value converted to nanoseconds (bound checked) - */ -u64 clockevent_delta2ns(unsigned long latch, struct clock_event_device *evt) +struct ce_unbind { + struct clock_event_device *ce; + int res; +}; + +static u64 cev_delta2ns(unsigned long latch, struct clock_event_device *evt, + bool ismax) { u64 clc = (u64) latch << evt->shift; + u64 rnd; if (unlikely(!evt->mult)) { evt->mult = 1; WARN_ON(1); } + rnd = (u64) evt->mult - 1; + + /* + * Upper bound sanity check. If the backwards conversion is + * not equal latch, we know that the above shift overflowed. + */ + if ((clc >> evt->shift) != (u64)latch) + clc = ~0ULL; + + /* + * Scaled math oddities: + * + * For mult <= (1 << shift) we can safely add mult - 1 to + * prevent integer rounding loss. So the backwards conversion + * from nsec to device ticks will be correct. + * + * For mult > (1 << shift), i.e. device frequency is > 1GHz we + * need to be careful. Adding mult - 1 will result in a value + * which when converted back to device ticks can be larger + * than latch by up to (mult - 1) >> shift. For the min_delta + * calculation we still want to apply this in order to stay + * above the minimum device ticks limit. For the upper limit + * we would end up with a latch value larger than the upper + * limit of the device, so we omit the add to stay below the + * device upper boundary. + * + * Also omit the add if it would overflow the u64 boundary. + */ + if ((~0ULL - clc > rnd) && + (!ismax || evt->mult <= (1U << evt->shift))) + clc += rnd; do_div(clc, evt->mult); - if (clc < 1000) - clc = 1000; - if (clc > KTIME_MAX) - clc = KTIME_MAX; - return clc; + /* Deltas less than 1usec are pointless noise */ + return clc > 1000 ? clc : 1000; +} + +/** + * clockevents_delta2ns - Convert a latch value (device ticks) to nanoseconds + * @latch: value to convert + * @evt: pointer to clock event device descriptor + * + * Math helper, returns latch value converted to nanoseconds (bound checked) + */ +u64 clockevent_delta2ns(unsigned long latch, struct clock_event_device *evt) +{ + return cev_delta2ns(latch, evt, false); } EXPORT_SYMBOL_GPL(clockevent_delta2ns); @@ -95,66 +131,145 @@ void clockevents_shutdown(struct clock_event_device *dev) dev->next_event.tv64 = KTIME_MAX; } +#ifdef CONFIG_GENERIC_CLOCKEVENTS_MIN_ADJUST + +/* Limit min_delta to a jiffie */ +#define MIN_DELTA_LIMIT (NSEC_PER_SEC / HZ) + +/** + * clockevents_increase_min_delta - raise minimum delta of a clock event device + * @dev: device to increase the minimum delta + * + * Returns 0 on success, -ETIME when the minimum delta reached the limit. + */ +static int clockevents_increase_min_delta(struct clock_event_device *dev) +{ + /* Nothing to do if we already reached the limit */ + if (dev->min_delta_ns >= MIN_DELTA_LIMIT) { + printk_deferred(KERN_WARNING + "CE: Reprogramming failure. Giving up\n"); + dev->next_event.tv64 = KTIME_MAX; + return -ETIME; + } + + if (dev->min_delta_ns < 5000) + dev->min_delta_ns = 5000; + else + dev->min_delta_ns += dev->min_delta_ns >> 1; + + if (dev->min_delta_ns > MIN_DELTA_LIMIT) + dev->min_delta_ns = MIN_DELTA_LIMIT; + + printk_deferred(KERN_WARNING + "CE: %s increased min_delta_ns to %llu nsec\n", + dev->name ? dev->name : "?", + (unsigned long long) dev->min_delta_ns); + return 0; +} + +/** + * clockevents_program_min_delta - Set clock event device to the minimum delay. + * @dev: device to program + * + * Returns 0 on success, -ETIME when the retry loop failed. + */ +static int clockevents_program_min_delta(struct clock_event_device *dev) +{ + unsigned long long clc; + int64_t delta; + int i; + + for (i = 0;;) { + delta = dev->min_delta_ns; + dev->next_event = ktime_add_ns(ktime_get(), delta); + + if (dev->mode == CLOCK_EVT_MODE_SHUTDOWN) + return 0; + + dev->retries++; + clc = ((unsigned long long) delta * dev->mult) >> dev->shift; + if (dev->set_next_event((unsigned long) clc, dev) == 0) + return 0; + + if (++i > 2) { + /* + * We tried 3 times to program the device with the + * given min_delta_ns. Try to increase the minimum + * delta, if that fails as well get out of here. + */ + if (clockevents_increase_min_delta(dev)) + return -ETIME; + i = 0; + } + } +} + +#else /* CONFIG_GENERIC_CLOCKEVENTS_MIN_ADJUST */ + +/** + * clockevents_program_min_delta - Set clock event device to the minimum delay. + * @dev: device to program + * + * Returns 0 on success, -ETIME when the retry loop failed. + */ +static int clockevents_program_min_delta(struct clock_event_device *dev) +{ + unsigned long long clc; + int64_t delta; + + delta = dev->min_delta_ns; + dev->next_event = ktime_add_ns(ktime_get(), delta); + + if (dev->mode == CLOCK_EVT_MODE_SHUTDOWN) + return 0; + + dev->retries++; + clc = ((unsigned long long) delta * dev->mult) >> dev->shift; + return dev->set_next_event((unsigned long) clc, dev); +} + +#endif /* CONFIG_GENERIC_CLOCKEVENTS_MIN_ADJUST */ + /** * clockevents_program_event - Reprogram the clock event device. + * @dev: device to program * @expires: absolute expiry time (monotonic clock) + * @force: program minimum delay if expires can not be set * * Returns 0 on success, -ETIME when the event is in the past. */ int clockevents_program_event(struct clock_event_device *dev, ktime_t expires, - ktime_t now) + bool force) { unsigned long long clc; int64_t delta; + int rc; if (unlikely(expires.tv64 < 0)) { WARN_ON_ONCE(1); return -ETIME; } - delta = ktime_to_ns(ktime_sub(expires, now)); - - if (delta <= 0) - return -ETIME; - dev->next_event = expires; if (dev->mode == CLOCK_EVT_MODE_SHUTDOWN) return 0; - if (delta > dev->max_delta_ns) - delta = dev->max_delta_ns; - if (delta < dev->min_delta_ns) - delta = dev->min_delta_ns; - - clc = delta * dev->mult; - clc >>= dev->shift; - - return dev->set_next_event((unsigned long) clc, dev); -} + /* Shortcut for clockevent devices that can deal with ktime. */ + if (dev->features & CLOCK_EVT_FEAT_KTIME) + return dev->set_next_ktime(expires, dev); -/** - * clockevents_register_notifier - register a clock events change listener - */ -int clockevents_register_notifier(struct notifier_block *nb) -{ - unsigned long flags; - int ret; + delta = ktime_to_ns(ktime_sub(expires, ktime_get())); + if (delta <= 0) + return force ? clockevents_program_min_delta(dev) : -ETIME; - raw_spin_lock_irqsave(&clockevents_lock, flags); - ret = raw_notifier_chain_register(&clockevents_chain, nb); - raw_spin_unlock_irqrestore(&clockevents_lock, flags); + delta = min(delta, (int64_t) dev->max_delta_ns); + delta = max(delta, (int64_t) dev->min_delta_ns); - return ret; -} + clc = ((unsigned long long) delta * dev->mult) >> dev->shift; + rc = dev->set_next_event((unsigned long) clc, dev); -/* - * Notify about a clock event change. Called with clockevents_lock - * held. - */ -static void clockevents_do_notify(unsigned long reason, void *dev) -{ - raw_notifier_call_chain(&clockevents_chain, reason, dev); + return (rc && force) ? clockevents_program_min_delta(dev) : rc; } /* @@ -170,9 +285,93 @@ static void clockevents_notify_released(void) struct clock_event_device, list); list_del(&dev->list); list_add(&dev->list, &clockevent_devices); - clockevents_do_notify(CLOCK_EVT_NOTIFY_ADD, dev); + tick_check_new_device(dev); + } +} + +/* + * Try to install a replacement clock event device + */ +static int clockevents_replace(struct clock_event_device *ced) +{ + struct clock_event_device *dev, *newdev = NULL; + + list_for_each_entry(dev, &clockevent_devices, list) { + if (dev == ced || dev->mode != CLOCK_EVT_MODE_UNUSED) + continue; + + if (!tick_check_replacement(newdev, dev)) + continue; + + if (!try_module_get(dev->owner)) + continue; + + if (newdev) + module_put(newdev->owner); + newdev = dev; + } + if (newdev) { + tick_install_replacement(newdev); + list_del_init(&ced->list); + } + return newdev ? 0 : -EBUSY; +} + +/* + * Called with clockevents_mutex and clockevents_lock held + */ +static int __clockevents_try_unbind(struct clock_event_device *ced, int cpu) +{ + /* Fast track. Device is unused */ + if (ced->mode == CLOCK_EVT_MODE_UNUSED) { + list_del_init(&ced->list); + return 0; } + + return ced == per_cpu(tick_cpu_device, cpu).evtdev ? -EAGAIN : -EBUSY; +} + +/* + * SMP function call to unbind a device + */ +static void __clockevents_unbind(void *arg) +{ + struct ce_unbind *cu = arg; + int res; + + raw_spin_lock(&clockevents_lock); + res = __clockevents_try_unbind(cu->ce, smp_processor_id()); + if (res == -EAGAIN) + res = clockevents_replace(cu->ce); + cu->res = res; + raw_spin_unlock(&clockevents_lock); +} + +/* + * Issues smp function call to unbind a per cpu device. Called with + * clockevents_mutex held. + */ +static int clockevents_unbind(struct clock_event_device *ced, int cpu) +{ + struct ce_unbind cu = { .ce = ced, .res = -ENODEV }; + + smp_call_function_single(cpu, __clockevents_unbind, &cu, 1); + return cu.res; +} + +/* + * Unbind a clockevents device. + */ +int clockevents_unbind_device(struct clock_event_device *ced, int cpu) +{ + int ret; + + mutex_lock(&clockevents_mutex); + ret = clockevents_unbind(ced, cpu); + mutex_unlock(&clockevents_mutex); + return ret; } +EXPORT_SYMBOL_GPL(clockevents_unbind); /** * clockevents_register_device - register a clock event device @@ -183,18 +382,103 @@ void clockevents_register_device(struct clock_event_device *dev) unsigned long flags; BUG_ON(dev->mode != CLOCK_EVT_MODE_UNUSED); - BUG_ON(!dev->cpumask); + if (!dev->cpumask) { + WARN_ON(num_possible_cpus() > 1); + dev->cpumask = cpumask_of(smp_processor_id()); + } raw_spin_lock_irqsave(&clockevents_lock, flags); list_add(&dev->list, &clockevent_devices); - clockevents_do_notify(CLOCK_EVT_NOTIFY_ADD, dev); + tick_check_new_device(dev); clockevents_notify_released(); raw_spin_unlock_irqrestore(&clockevents_lock, flags); } EXPORT_SYMBOL_GPL(clockevents_register_device); +void clockevents_config(struct clock_event_device *dev, u32 freq) +{ + u64 sec; + + if (!(dev->features & CLOCK_EVT_FEAT_ONESHOT)) + return; + + /* + * Calculate the maximum number of seconds we can sleep. Limit + * to 10 minutes for hardware which can program more than + * 32bit ticks so we still get reasonable conversion values. + */ + sec = dev->max_delta_ticks; + do_div(sec, freq); + if (!sec) + sec = 1; + else if (sec > 600 && dev->max_delta_ticks > UINT_MAX) + sec = 600; + + clockevents_calc_mult_shift(dev, freq, sec); + dev->min_delta_ns = cev_delta2ns(dev->min_delta_ticks, dev, false); + dev->max_delta_ns = cev_delta2ns(dev->max_delta_ticks, dev, true); +} + +/** + * clockevents_config_and_register - Configure and register a clock event device + * @dev: device to register + * @freq: The clock frequency + * @min_delta: The minimum clock ticks to program in oneshot mode + * @max_delta: The maximum clock ticks to program in oneshot mode + * + * min/max_delta can be 0 for devices which do not support oneshot mode. + */ +void clockevents_config_and_register(struct clock_event_device *dev, + u32 freq, unsigned long min_delta, + unsigned long max_delta) +{ + dev->min_delta_ticks = min_delta; + dev->max_delta_ticks = max_delta; + clockevents_config(dev, freq); + clockevents_register_device(dev); +} +EXPORT_SYMBOL_GPL(clockevents_config_and_register); + +int __clockevents_update_freq(struct clock_event_device *dev, u32 freq) +{ + clockevents_config(dev, freq); + + if (dev->mode == CLOCK_EVT_MODE_ONESHOT) + return clockevents_program_event(dev, dev->next_event, false); + + if (dev->mode == CLOCK_EVT_MODE_PERIODIC) + dev->set_mode(CLOCK_EVT_MODE_PERIODIC, dev); + + return 0; +} + +/** + * clockevents_update_freq - Update frequency and reprogram a clock event device. + * @dev: device to modify + * @freq: new device frequency + * + * Reconfigure and reprogram a clock event device in oneshot + * mode. Must be called on the cpu for which the device delivers per + * cpu timer events. If called for the broadcast device the core takes + * care of serialization. + * + * Returns 0 on success, -ETIME when the event is in the past. + */ +int clockevents_update_freq(struct clock_event_device *dev, u32 freq) +{ + unsigned long flags; + int ret; + + local_irq_save(flags); + ret = tick_broadcast_update_freq(dev, freq); + if (ret == -ENODEV) + ret = __clockevents_update_freq(dev, freq); + local_irq_restore(flags); + return ret; +} + /* * Noop handler when we shut down an event device */ @@ -220,6 +504,7 @@ void clockevents_exchange_device(struct clock_event_device *old, * released list and do a notify add later. */ if (old) { + module_put(old->owner); clockevents_set_mode(old, CLOCK_EVT_MODE_UNUSED); list_del(&old->list); list_add(&old->list, &clockevents_released); @@ -232,21 +517,72 @@ void clockevents_exchange_device(struct clock_event_device *old, local_irq_restore(flags); } +/** + * clockevents_suspend - suspend clock devices + */ +void clockevents_suspend(void) +{ + struct clock_event_device *dev; + + list_for_each_entry_reverse(dev, &clockevent_devices, list) + if (dev->suspend) + dev->suspend(dev); +} + +/** + * clockevents_resume - resume clock devices + */ +void clockevents_resume(void) +{ + struct clock_event_device *dev; + + list_for_each_entry(dev, &clockevent_devices, list) + if (dev->resume) + dev->resume(dev); +} + #ifdef CONFIG_GENERIC_CLOCKEVENTS /** * clockevents_notify - notification about relevant events + * Returns 0 on success, any other value on error */ -void clockevents_notify(unsigned long reason, void *arg) +int clockevents_notify(unsigned long reason, void *arg) { struct clock_event_device *dev, *tmp; unsigned long flags; - int cpu; + int cpu, ret = 0; raw_spin_lock_irqsave(&clockevents_lock, flags); - clockevents_do_notify(reason, arg); switch (reason) { + case CLOCK_EVT_NOTIFY_BROADCAST_ON: + case CLOCK_EVT_NOTIFY_BROADCAST_OFF: + case CLOCK_EVT_NOTIFY_BROADCAST_FORCE: + tick_broadcast_on_off(reason, arg); + break; + + case CLOCK_EVT_NOTIFY_BROADCAST_ENTER: + case CLOCK_EVT_NOTIFY_BROADCAST_EXIT: + ret = tick_broadcast_oneshot_control(reason); + break; + + case CLOCK_EVT_NOTIFY_CPU_DYING: + tick_handover_do_timer(arg); + break; + + case CLOCK_EVT_NOTIFY_SUSPEND: + tick_suspend(); + tick_suspend_broadcast(); + break; + + case CLOCK_EVT_NOTIFY_RESUME: + tick_resume(); + break; + case CLOCK_EVT_NOTIFY_CPU_DEAD: + tick_shutdown_broadcast_oneshot(arg); + tick_shutdown_broadcast(arg); + tick_shutdown(arg); /* * Unregister the clock event devices which were * released from the users in the notify chain. @@ -259,7 +595,8 @@ void clockevents_notify(unsigned long reason, void *arg) cpu = *((int *)arg); list_for_each_entry_safe(dev, tmp, &clockevent_devices, list) { if (cpumask_test_cpu(cpu, dev->cpumask) && - cpumask_weight(dev->cpumask) == 1) { + cpumask_weight(dev->cpumask) == 1 && + !tick_is_broadcast_device(dev)) { BUG_ON(dev->mode != CLOCK_EVT_MODE_UNUSED); list_del(&dev->list); } @@ -269,6 +606,126 @@ void clockevents_notify(unsigned long reason, void *arg) break; } raw_spin_unlock_irqrestore(&clockevents_lock, flags); + return ret; } EXPORT_SYMBOL_GPL(clockevents_notify); + +#ifdef CONFIG_SYSFS +struct bus_type clockevents_subsys = { + .name = "clockevents", + .dev_name = "clockevent", +}; + +static DEFINE_PER_CPU(struct device, tick_percpu_dev); +static struct tick_device *tick_get_tick_dev(struct device *dev); + +static ssize_t sysfs_show_current_tick_dev(struct device *dev, + struct device_attribute *attr, + char *buf) +{ + struct tick_device *td; + ssize_t count = 0; + + raw_spin_lock_irq(&clockevents_lock); + td = tick_get_tick_dev(dev); + if (td && td->evtdev) + count = snprintf(buf, PAGE_SIZE, "%s\n", td->evtdev->name); + raw_spin_unlock_irq(&clockevents_lock); + return count; +} +static DEVICE_ATTR(current_device, 0444, sysfs_show_current_tick_dev, NULL); + +/* We don't support the abomination of removable broadcast devices */ +static ssize_t sysfs_unbind_tick_dev(struct device *dev, + struct device_attribute *attr, + const char *buf, size_t count) +{ + char name[CS_NAME_LEN]; + ssize_t ret = sysfs_get_uname(buf, name, count); + struct clock_event_device *ce; + + if (ret < 0) + return ret; + + ret = -ENODEV; + mutex_lock(&clockevents_mutex); + raw_spin_lock_irq(&clockevents_lock); + list_for_each_entry(ce, &clockevent_devices, list) { + if (!strcmp(ce->name, name)) { + ret = __clockevents_try_unbind(ce, dev->id); + break; + } + } + raw_spin_unlock_irq(&clockevents_lock); + /* + * We hold clockevents_mutex, so ce can't go away + */ + if (ret == -EAGAIN) + ret = clockevents_unbind(ce, dev->id); + mutex_unlock(&clockevents_mutex); + return ret ? ret : count; +} +static DEVICE_ATTR(unbind_device, 0200, NULL, sysfs_unbind_tick_dev); + +#ifdef CONFIG_GENERIC_CLOCKEVENTS_BROADCAST +static struct device tick_bc_dev = { + .init_name = "broadcast", + .id = 0, + .bus = &clockevents_subsys, +}; + +static struct tick_device *tick_get_tick_dev(struct device *dev) +{ + return dev == &tick_bc_dev ? tick_get_broadcast_device() : + &per_cpu(tick_cpu_device, dev->id); +} + +static __init int tick_broadcast_init_sysfs(void) +{ + int err = device_register(&tick_bc_dev); + + if (!err) + err = device_create_file(&tick_bc_dev, &dev_attr_current_device); + return err; +} +#else +static struct tick_device *tick_get_tick_dev(struct device *dev) +{ + return &per_cpu(tick_cpu_device, dev->id); +} +static inline int tick_broadcast_init_sysfs(void) { return 0; } #endif + +static int __init tick_init_sysfs(void) +{ + int cpu; + + for_each_possible_cpu(cpu) { + struct device *dev = &per_cpu(tick_percpu_dev, cpu); + int err; + + dev->id = cpu; + dev->bus = &clockevents_subsys; + err = device_register(dev); + if (!err) + err = device_create_file(dev, &dev_attr_current_device); + if (!err) + err = device_create_file(dev, &dev_attr_unbind_device); + if (err) + return err; + } + return tick_broadcast_init_sysfs(); +} + +static int __init clockevents_init_sysfs(void) +{ + int err = subsys_system_register(&clockevents_subsys, NULL); + + if (!err) + err = tick_init_sysfs(); + return err; +} +device_initcall(clockevents_init_sysfs); +#endif /* SYSFS */ + +#endif /* GENERIC_CLOCK_EVENTS */ diff --git a/kernel/time/clocksource.c b/kernel/time/clocksource.c index e85c23404d3..ba3e502c955 100644 --- a/kernel/time/clocksource.c +++ b/kernel/time/clocksource.c @@ -23,14 +23,16 @@ * o Allow clocksource drivers to be unregistered */ +#include <linux/device.h> #include <linux/clocksource.h> -#include <linux/sysdev.h> #include <linux/init.h> #include <linux/module.h> #include <linux/sched.h> /* for spin_unlock_irq() using preempt_count() m68k */ #include <linux/tick.h> #include <linux/kthread.h> +#include "tick-internal.h" + void timecounter_init(struct timecounter *tc, const struct cyclecounter *cc, u64 start_tstamp) @@ -113,7 +115,7 @@ EXPORT_SYMBOL_GPL(timecounter_cyc2time); * @shift: pointer to shift variable * @from: frequency to convert from * @to: frequency to convert to - * @minsec: guaranteed runtime conversion range in seconds + * @maxsec: guaranteed runtime conversion range in seconds * * The function evaluates the shift/mult pair for the scaled math * operations of clocksources and clockevents. @@ -122,7 +124,7 @@ EXPORT_SYMBOL_GPL(timecounter_cyc2time); * NSEC_PER_SEC == 1GHz and @from is the counter frequency. For clock * event @to is the counter frequency and @from is NSEC_PER_SEC. * - * The @minsec conversion range argument controls the time frame in + * The @maxsec conversion range argument controls the time frame in * seconds which must be covered by the runtime conversion with the * calculated mult and shift factors. This guarantees that no 64bit * overflow happens when the input value of the conversion is @@ -131,7 +133,7 @@ EXPORT_SYMBOL_GPL(timecounter_cyc2time); * factors. */ void -clocks_calc_mult_shift(u32 *mult, u32 *shift, u32 from, u32 to, u32 minsec) +clocks_calc_mult_shift(u32 *mult, u32 *shift, u32 from, u32 to, u32 maxsec) { u64 tmp; u32 sft, sftacc= 32; @@ -140,7 +142,7 @@ clocks_calc_mult_shift(u32 *mult, u32 *shift, u32 from, u32 to, u32 minsec) * Calculate the shift factor which is limiting the conversion * range: */ - tmp = ((u64)minsec * from) >> 32; + tmp = ((u64)maxsec * from) >> 32; while (tmp) { tmp >>=1; sftacc--; @@ -152,6 +154,7 @@ clocks_calc_mult_shift(u32 *mult, u32 *shift, u32 from, u32 to, u32 minsec) */ for (sft = 32; sft > 0; sft--) { tmp = (u64) to << sft; + tmp += from / 2; do_div(tmp, from); if ((tmp >> sftacc) == 0) break; @@ -173,19 +176,20 @@ clocks_calc_mult_shift(u32 *mult, u32 *shift, u32 from, u32 to, u32 minsec) static struct clocksource *curr_clocksource; static LIST_HEAD(clocksource_list); static DEFINE_MUTEX(clocksource_mutex); -static char override_name[32]; +static char override_name[CS_NAME_LEN]; static int finished_booting; #ifdef CONFIG_CLOCKSOURCE_WATCHDOG static void clocksource_watchdog_work(struct work_struct *work); +static void clocksource_select(void); static LIST_HEAD(watchdog_list); static struct clocksource *watchdog; static struct timer_list watchdog_timer; static DECLARE_WORK(watchdog_work, clocksource_watchdog_work); static DEFINE_SPINLOCK(watchdog_lock); -static cycle_t watchdog_last; static int watchdog_running; +static atomic_t watchdog_reset_pending; static int clocksource_watchdog_kthread(void *data); static void __clocksource_change_rating(struct clocksource *cs, int rating); @@ -247,16 +251,13 @@ static void clocksource_watchdog(unsigned long data) struct clocksource *cs; cycle_t csnow, wdnow; int64_t wd_nsec, cs_nsec; - int next_cpu; + int next_cpu, reset_pending; spin_lock(&watchdog_lock); if (!watchdog_running) goto out; - wdnow = watchdog->read(watchdog); - wd_nsec = clocksource_cyc2ns((wdnow - watchdog_last) & watchdog->mask, - watchdog->mult, watchdog->shift); - watchdog_last = wdnow; + reset_pending = atomic_read(&watchdog_reset_pending); list_for_each_entry(cs, &watchdog_list, wd_list) { @@ -267,20 +268,33 @@ static void clocksource_watchdog(unsigned long data) continue; } + local_irq_disable(); csnow = cs->read(cs); + wdnow = watchdog->read(watchdog); + local_irq_enable(); /* Clocksource initialized ? */ - if (!(cs->flags & CLOCK_SOURCE_WATCHDOG)) { + if (!(cs->flags & CLOCK_SOURCE_WATCHDOG) || + atomic_read(&watchdog_reset_pending)) { cs->flags |= CLOCK_SOURCE_WATCHDOG; - cs->wd_last = csnow; + cs->wd_last = wdnow; + cs->cs_last = csnow; continue; } - /* Check the deviation from the watchdog clocksource. */ - cs_nsec = clocksource_cyc2ns((csnow - cs->wd_last) & + wd_nsec = clocksource_cyc2ns((wdnow - cs->wd_last) & watchdog->mask, + watchdog->mult, watchdog->shift); + + cs_nsec = clocksource_cyc2ns((csnow - cs->cs_last) & cs->mask, cs->mult, cs->shift); - cs->wd_last = csnow; - if (abs(cs_nsec - wd_nsec) > WATCHDOG_THRESHOLD) { + cs->cs_last = csnow; + cs->wd_last = wdnow; + + if (atomic_read(&watchdog_reset_pending)) + continue; + + /* Check the deviation from the watchdog clocksource. */ + if ((abs(cs_nsec - wd_nsec) > WATCHDOG_THRESHOLD)) { clocksource_unstable(cs, cs_nsec - wd_nsec); continue; } @@ -288,17 +302,41 @@ static void clocksource_watchdog(unsigned long data) if (!(cs->flags & CLOCK_SOURCE_VALID_FOR_HRES) && (cs->flags & CLOCK_SOURCE_IS_CONTINUOUS) && (watchdog->flags & CLOCK_SOURCE_IS_CONTINUOUS)) { + /* Mark it valid for high-res. */ cs->flags |= CLOCK_SOURCE_VALID_FOR_HRES; + + /* + * clocksource_done_booting() will sort it if + * finished_booting is not set yet. + */ + if (!finished_booting) + continue; + /* - * We just marked the clocksource as highres-capable, - * notify the rest of the system as well so that we - * transition into high-res mode: + * If this is not the current clocksource let + * the watchdog thread reselect it. Due to the + * change to high res this clocksource might + * be preferred now. If it is the current + * clocksource let the tick code know about + * that change. */ - tick_clock_notify(); + if (cs != curr_clocksource) { + cs->flags |= CLOCK_SOURCE_RESELECT; + schedule_work(&watchdog_work); + } else { + tick_clock_notify(); + } } } /* + * We only clear the watchdog_reset_pending, when we did a + * full cycle through all clocksources. + */ + if (reset_pending) + atomic_dec(&watchdog_reset_pending); + + /* * Cycle through CPUs to check if the CPUs stay synchronized * to each other. */ @@ -317,7 +355,6 @@ static inline void clocksource_start_watchdog(void) return; init_timer(&watchdog_timer); watchdog_timer.function = clocksource_watchdog; - watchdog_last = watchdog->read(watchdog); watchdog_timer.expires = jiffies + WATCHDOG_INTERVAL; add_timer_on(&watchdog_timer, cpumask_first(cpu_online_mask)); watchdog_running = 1; @@ -341,11 +378,7 @@ static inline void clocksource_reset_watchdog(void) static void clocksource_resume_watchdog(void) { - unsigned long flags; - - spin_lock_irqsave(&watchdog_lock, flags); - clocksource_reset_watchdog(); - spin_unlock_irqrestore(&watchdog_lock, flags); + atomic_inc(&watchdog_reset_pending); } static void clocksource_enqueue_watchdog(struct clocksource *cs) @@ -375,44 +408,39 @@ static void clocksource_enqueue_watchdog(struct clocksource *cs) static void clocksource_dequeue_watchdog(struct clocksource *cs) { - struct clocksource *tmp; unsigned long flags; spin_lock_irqsave(&watchdog_lock, flags); - if (cs->flags & CLOCK_SOURCE_MUST_VERIFY) { - /* cs is a watched clocksource. */ - list_del_init(&cs->wd_list); - } else if (cs == watchdog) { - /* Reset watchdog cycles */ - clocksource_reset_watchdog(); - /* Current watchdog is removed. Find an alternative. */ - watchdog = NULL; - list_for_each_entry(tmp, &clocksource_list, list) { - if (tmp == cs || tmp->flags & CLOCK_SOURCE_MUST_VERIFY) - continue; - if (!watchdog || tmp->rating > watchdog->rating) - watchdog = tmp; + if (cs != watchdog) { + if (cs->flags & CLOCK_SOURCE_MUST_VERIFY) { + /* cs is a watched clocksource. */ + list_del_init(&cs->wd_list); + /* Check if the watchdog timer needs to be stopped. */ + clocksource_stop_watchdog(); } } - cs->flags &= ~CLOCK_SOURCE_WATCHDOG; - /* Check if the watchdog timer needs to be stopped. */ - clocksource_stop_watchdog(); spin_unlock_irqrestore(&watchdog_lock, flags); } -static int clocksource_watchdog_kthread(void *data) +static int __clocksource_watchdog_kthread(void) { struct clocksource *cs, *tmp; unsigned long flags; LIST_HEAD(unstable); + int select = 0; - mutex_lock(&clocksource_mutex); spin_lock_irqsave(&watchdog_lock, flags); - list_for_each_entry_safe(cs, tmp, &watchdog_list, wd_list) + list_for_each_entry_safe(cs, tmp, &watchdog_list, wd_list) { if (cs->flags & CLOCK_SOURCE_UNSTABLE) { list_del_init(&cs->wd_list); list_add(&cs->wd_list, &unstable); + select = 1; } + if (cs->flags & CLOCK_SOURCE_RESELECT) { + cs->flags &= ~CLOCK_SOURCE_RESELECT; + select = 1; + } + } /* Check if the watchdog timer needs to be stopped. */ clocksource_stop_watchdog(); spin_unlock_irqrestore(&watchdog_lock, flags); @@ -422,10 +450,23 @@ static int clocksource_watchdog_kthread(void *data) list_del_init(&cs->wd_list); __clocksource_change_rating(cs, 0); } + return select; +} + +static int clocksource_watchdog_kthread(void *data) +{ + mutex_lock(&clocksource_mutex); + if (__clocksource_watchdog_kthread()) + clocksource_select(); mutex_unlock(&clocksource_mutex); return 0; } +static bool clocksource_is_watchdog(struct clocksource *cs) +{ + return cs == watchdog; +} + #else /* CONFIG_CLOCKSOURCE_WATCHDOG */ static void clocksource_enqueue_watchdog(struct clocksource *cs) @@ -436,11 +477,25 @@ static void clocksource_enqueue_watchdog(struct clocksource *cs) static inline void clocksource_dequeue_watchdog(struct clocksource *cs) { } static inline void clocksource_resume_watchdog(void) { } -static inline int clocksource_watchdog_kthread(void *data) { return 0; } +static inline int __clocksource_watchdog_kthread(void) { return 0; } +static bool clocksource_is_watchdog(struct clocksource *cs) { return false; } +void clocksource_mark_unstable(struct clocksource *cs) { } #endif /* CONFIG_CLOCKSOURCE_WATCHDOG */ /** + * clocksource_suspend - suspend the clocksource(s) + */ +void clocksource_suspend(void) +{ + struct clocksource *cs; + + list_for_each_entry_reverse(cs, &clocksource_list, list) + if (cs->suspend) + cs->suspend(cs); +} + +/** * clocksource_resume - resume the clocksource(s) */ void clocksource_resume(void) @@ -449,7 +504,7 @@ void clocksource_resume(void) list_for_each_entry(cs, &clocksource_list, list) if (cs->resume) - cs->resume(); + cs->resume(cs); clocksource_resume_watchdog(); } @@ -458,8 +513,8 @@ void clocksource_resume(void) * clocksource_touch_watchdog - Update watchdog * * Update the watchdog after exception contexts such as kgdb so as not - * to incorrectly trip the watchdog. - * + * to incorrectly trip the watchdog. This might fail when the kernel + * was stopped in code which holds watchdog_lock. */ void clocksource_touch_watchdog(void) { @@ -467,66 +522,118 @@ void clocksource_touch_watchdog(void) } /** - * clocksource_max_deferment - Returns max time the clocksource can be deferred + * clocksource_max_adjustment- Returns max adjustment amount * @cs: Pointer to clocksource * */ -static u64 clocksource_max_deferment(struct clocksource *cs) +static u32 clocksource_max_adjustment(struct clocksource *cs) +{ + u64 ret; + /* + * We won't try to correct for more than 11% adjustments (110,000 ppm), + */ + ret = (u64)cs->mult * 11; + do_div(ret,100); + return (u32)ret; +} + +/** + * clocks_calc_max_nsecs - Returns maximum nanoseconds that can be converted + * @mult: cycle to nanosecond multiplier + * @shift: cycle to nanosecond divisor (power of two) + * @maxadj: maximum adjustment value to mult (~11%) + * @mask: bitmask for two's complement subtraction of non 64 bit counters + */ +u64 clocks_calc_max_nsecs(u32 mult, u32 shift, u32 maxadj, u64 mask) { u64 max_nsecs, max_cycles; /* * Calculate the maximum number of cycles that we can pass to the * cyc2ns function without overflowing a 64-bit signed result. The - * maximum number of cycles is equal to ULLONG_MAX/cs->mult which - * is equivalent to the below. - * max_cycles < (2^63)/cs->mult - * max_cycles < 2^(log2((2^63)/cs->mult)) - * max_cycles < 2^(log2(2^63) - log2(cs->mult)) - * max_cycles < 2^(63 - log2(cs->mult)) - * max_cycles < 1 << (63 - log2(cs->mult)) + * maximum number of cycles is equal to ULLONG_MAX/(mult+maxadj) + * which is equivalent to the below. + * max_cycles < (2^63)/(mult + maxadj) + * max_cycles < 2^(log2((2^63)/(mult + maxadj))) + * max_cycles < 2^(log2(2^63) - log2(mult + maxadj)) + * max_cycles < 2^(63 - log2(mult + maxadj)) + * max_cycles < 1 << (63 - log2(mult + maxadj)) * Please note that we add 1 to the result of the log2 to account for * any rounding errors, ensure the above inequality is satisfied and * no overflow will occur. */ - max_cycles = 1ULL << (63 - (ilog2(cs->mult) + 1)); + max_cycles = 1ULL << (63 - (ilog2(mult + maxadj) + 1)); /* * The actual maximum number of cycles we can defer the clocksource is - * determined by the minimum of max_cycles and cs->mask. + * determined by the minimum of max_cycles and mask. + * Note: Here we subtract the maxadj to make sure we don't sleep for + * too long if there's a large negative adjustment. */ - max_cycles = min_t(u64, max_cycles, (u64) cs->mask); - max_nsecs = clocksource_cyc2ns(max_cycles, cs->mult, cs->shift); + max_cycles = min(max_cycles, mask); + max_nsecs = clocksource_cyc2ns(max_cycles, mult - maxadj, shift); + return max_nsecs; +} + +/** + * clocksource_max_deferment - Returns max time the clocksource can be deferred + * @cs: Pointer to clocksource + * + */ +static u64 clocksource_max_deferment(struct clocksource *cs) +{ + u64 max_nsecs; + + max_nsecs = clocks_calc_max_nsecs(cs->mult, cs->shift, cs->maxadj, + cs->mask); /* * To ensure that the clocksource does not wrap whilst we are idle, * limit the time the clocksource can be deferred by 12.5%. Please * note a margin of 12.5% is used because this can be computed with * a shift, versus say 10% which would require division. */ - return max_nsecs - (max_nsecs >> 5); + return max_nsecs - (max_nsecs >> 3); } -#ifdef CONFIG_GENERIC_TIME +#ifndef CONFIG_ARCH_USES_GETTIMEOFFSET -/** - * clocksource_select - Select the best clocksource available - * - * Private function. Must hold clocksource_mutex when called. - * - * Select the clocksource with the best rating, or the clocksource, - * which is selected by userspace override. - */ -static void clocksource_select(void) +static struct clocksource *clocksource_find_best(bool oneshot, bool skipcur) { - struct clocksource *best, *cs; + struct clocksource *cs; if (!finished_booting || list_empty(&clocksource_list)) + return NULL; + + /* + * We pick the clocksource with the highest rating. If oneshot + * mode is active, we pick the highres valid clocksource with + * the best rating. + */ + list_for_each_entry(cs, &clocksource_list, list) { + if (skipcur && cs == curr_clocksource) + continue; + if (oneshot && !(cs->flags & CLOCK_SOURCE_VALID_FOR_HRES)) + continue; + return cs; + } + return NULL; +} + +static void __clocksource_select(bool skipcur) +{ + bool oneshot = tick_oneshot_mode_active(); + struct clocksource *best, *cs; + + /* Find the best suitable clocksource */ + best = clocksource_find_best(oneshot, skipcur); + if (!best) return; - /* First clocksource on the list has the best rating. */ - best = list_first_entry(&clocksource_list, struct clocksource, list); + /* Check for the override clocksource. */ list_for_each_entry(cs, &clocksource_list, list) { + if (skipcur && cs == curr_clocksource) + continue; if (strcmp(cs->name, override_name) != 0) continue; /* @@ -534,8 +641,7 @@ static void clocksource_select(void) * capable clocksource if the tick code is in oneshot * mode (highres or nohz) */ - if (!(cs->flags & CLOCK_SOURCE_VALID_FOR_HRES) && - tick_oneshot_mode_active()) { + if (!(cs->flags & CLOCK_SOURCE_VALID_FOR_HRES) && oneshot) { /* Override clocksource cannot be used. */ printk(KERN_WARNING "Override clocksource %s is not " "HRT compatible. Cannot switch while in " @@ -546,16 +652,35 @@ static void clocksource_select(void) best = cs; break; } - if (curr_clocksource != best) { - printk(KERN_INFO "Switching to clocksource %s\n", best->name); + + if (curr_clocksource != best && !timekeeping_notify(best)) { + pr_info("Switched to clocksource %s\n", best->name); curr_clocksource = best; - timekeeping_notify(curr_clocksource); } } -#else /* CONFIG_GENERIC_TIME */ +/** + * clocksource_select - Select the best clocksource available + * + * Private function. Must hold clocksource_mutex when called. + * + * Select the clocksource with the best rating, or the clocksource, + * which is selected by userspace override. + */ +static void clocksource_select(void) +{ + return __clocksource_select(false); +} + +static void clocksource_select_fallback(void) +{ + return __clocksource_select(true); +} + +#else /* !CONFIG_ARCH_USES_GETTIMEOFFSET */ static inline void clocksource_select(void) { } +static inline void clocksource_select_fallback(void) { } #endif @@ -568,14 +693,13 @@ static inline void clocksource_select(void) { } */ static int __init clocksource_done_booting(void) { + mutex_lock(&clocksource_mutex); + curr_clocksource = clocksource_default_clock(); finished_booting = 1; - /* * Run the watchdog first to eliminate unstable clock sources */ - clocksource_watchdog_kthread(NULL); - - mutex_lock(&clocksource_mutex); + __clocksource_watchdog_kthread(); clocksource_select(); mutex_unlock(&clocksource_mutex); return 0; @@ -598,20 +722,106 @@ static void clocksource_enqueue(struct clocksource *cs) } /** + * __clocksource_updatefreq_scale - Used update clocksource with new freq + * @cs: clocksource to be registered + * @scale: Scale factor multiplied against freq to get clocksource hz + * @freq: clocksource frequency (cycles per second) divided by scale + * + * This should only be called from the clocksource->enable() method. + * + * This *SHOULD NOT* be called directly! Please use the + * clocksource_updatefreq_hz() or clocksource_updatefreq_khz helper functions. + */ +void __clocksource_updatefreq_scale(struct clocksource *cs, u32 scale, u32 freq) +{ + u64 sec; + /* + * Calc the maximum number of seconds which we can run before + * wrapping around. For clocksources which have a mask > 32bit + * we need to limit the max sleep time to have a good + * conversion precision. 10 minutes is still a reasonable + * amount. That results in a shift value of 24 for a + * clocksource with mask >= 40bit and f >= 4GHz. That maps to + * ~ 0.06ppm granularity for NTP. We apply the same 12.5% + * margin as we do in clocksource_max_deferment() + */ + sec = (cs->mask - (cs->mask >> 3)); + do_div(sec, freq); + do_div(sec, scale); + if (!sec) + sec = 1; + else if (sec > 600 && cs->mask > UINT_MAX) + sec = 600; + + clocks_calc_mult_shift(&cs->mult, &cs->shift, freq, + NSEC_PER_SEC / scale, sec * scale); + + /* + * for clocksources that have large mults, to avoid overflow. + * Since mult may be adjusted by ntp, add an safety extra margin + * + */ + cs->maxadj = clocksource_max_adjustment(cs); + while ((cs->mult + cs->maxadj < cs->mult) + || (cs->mult - cs->maxadj > cs->mult)) { + cs->mult >>= 1; + cs->shift--; + cs->maxadj = clocksource_max_adjustment(cs); + } + + cs->max_idle_ns = clocksource_max_deferment(cs); +} +EXPORT_SYMBOL_GPL(__clocksource_updatefreq_scale); + +/** + * __clocksource_register_scale - Used to install new clocksources + * @cs: clocksource to be registered + * @scale: Scale factor multiplied against freq to get clocksource hz + * @freq: clocksource frequency (cycles per second) divided by scale + * + * Returns -EBUSY if registration fails, zero otherwise. + * + * This *SHOULD NOT* be called directly! Please use the + * clocksource_register_hz() or clocksource_register_khz helper functions. + */ +int __clocksource_register_scale(struct clocksource *cs, u32 scale, u32 freq) +{ + + /* Initialize mult/shift and max_idle_ns */ + __clocksource_updatefreq_scale(cs, scale, freq); + + /* Add clocksource to the clcoksource list */ + mutex_lock(&clocksource_mutex); + clocksource_enqueue(cs); + clocksource_enqueue_watchdog(cs); + clocksource_select(); + mutex_unlock(&clocksource_mutex); + return 0; +} +EXPORT_SYMBOL_GPL(__clocksource_register_scale); + + +/** * clocksource_register - Used to install new clocksources - * @t: clocksource to be registered + * @cs: clocksource to be registered * * Returns -EBUSY if registration fails, zero otherwise. */ int clocksource_register(struct clocksource *cs) { + /* calculate max adjustment for given mult/shift */ + cs->maxadj = clocksource_max_adjustment(cs); + WARN_ONCE(cs->mult + cs->maxadj < cs->mult, + "Clocksource %s might overflow on 11%% adjustment\n", + cs->name); + /* calculate max idle time permitted for this clocksource */ cs->max_idle_ns = clocksource_max_deferment(cs); mutex_lock(&clocksource_mutex); clocksource_enqueue(cs); - clocksource_select(); clocksource_enqueue_watchdog(cs); + clocksource_select(); mutex_unlock(&clocksource_mutex); return 0; } @@ -622,30 +832,58 @@ static void __clocksource_change_rating(struct clocksource *cs, int rating) list_del(&cs->list); cs->rating = rating; clocksource_enqueue(cs); - clocksource_select(); } /** * clocksource_change_rating - Change the rating of a registered clocksource + * @cs: clocksource to be changed + * @rating: new rating */ void clocksource_change_rating(struct clocksource *cs, int rating) { mutex_lock(&clocksource_mutex); __clocksource_change_rating(cs, rating); + clocksource_select(); mutex_unlock(&clocksource_mutex); } EXPORT_SYMBOL(clocksource_change_rating); +/* + * Unbind clocksource @cs. Called with clocksource_mutex held + */ +static int clocksource_unbind(struct clocksource *cs) +{ + /* + * I really can't convince myself to support this on hardware + * designed by lobotomized monkeys. + */ + if (clocksource_is_watchdog(cs)) + return -EBUSY; + + if (cs == curr_clocksource) { + /* Select and try to install a replacement clock source */ + clocksource_select_fallback(); + if (curr_clocksource == cs) + return -EBUSY; + } + clocksource_dequeue_watchdog(cs); + list_del_init(&cs->list); + return 0; +} + /** * clocksource_unregister - remove a registered clocksource + * @cs: clocksource to be unregistered */ -void clocksource_unregister(struct clocksource *cs) +int clocksource_unregister(struct clocksource *cs) { + int ret = 0; + mutex_lock(&clocksource_mutex); - clocksource_dequeue_watchdog(cs); - list_del(&cs->list); - clocksource_select(); + if (!list_empty(&cs->list)) + ret = clocksource_unbind(cs); mutex_unlock(&clocksource_mutex); + return ret; } EXPORT_SYMBOL(clocksource_unregister); @@ -653,13 +891,14 @@ EXPORT_SYMBOL(clocksource_unregister); /** * sysfs_show_current_clocksources - sysfs interface for current clocksource * @dev: unused + * @attr: unused * @buf: char buffer to be filled with clocksource list * * Provides sysfs interface for listing current clocksource. */ static ssize_t -sysfs_show_current_clocksources(struct sys_device *dev, - struct sysdev_attribute *attr, char *buf) +sysfs_show_current_clocksources(struct device *dev, + struct device_attribute *attr, char *buf) { ssize_t count = 0; @@ -670,35 +909,44 @@ sysfs_show_current_clocksources(struct sys_device *dev, return count; } +ssize_t sysfs_get_uname(const char *buf, char *dst, size_t cnt) +{ + size_t ret = cnt; + + /* strings from sysfs write are not 0 terminated! */ + if (!cnt || cnt >= CS_NAME_LEN) + return -EINVAL; + + /* strip of \n: */ + if (buf[cnt-1] == '\n') + cnt--; + if (cnt > 0) + memcpy(dst, buf, cnt); + dst[cnt] = 0; + return ret; +} + /** * sysfs_override_clocksource - interface for manually overriding clocksource * @dev: unused + * @attr: unused * @buf: name of override clocksource * @count: length of buffer * * Takes input from sysfs interface for manually overriding the default * clocksource selection. */ -static ssize_t sysfs_override_clocksource(struct sys_device *dev, - struct sysdev_attribute *attr, +static ssize_t sysfs_override_clocksource(struct device *dev, + struct device_attribute *attr, const char *buf, size_t count) { - size_t ret = count; - - /* strings from sysfs write are not 0 terminated! */ - if (count >= sizeof(override_name)) - return -EINVAL; - - /* strip of \n: */ - if (buf[count-1] == '\n') - count--; + ssize_t ret; mutex_lock(&clocksource_mutex); - if (count > 0) - memcpy(override_name, buf, count); - override_name[count] = 0; - clocksource_select(); + ret = sysfs_get_uname(buf, override_name, count); + if (ret >= 0) + clocksource_select(); mutex_unlock(&clocksource_mutex); @@ -706,15 +954,50 @@ static ssize_t sysfs_override_clocksource(struct sys_device *dev, } /** + * sysfs_unbind_current_clocksource - interface for manually unbinding clocksource + * @dev: unused + * @attr: unused + * @buf: unused + * @count: length of buffer + * + * Takes input from sysfs interface for manually unbinding a clocksource. + */ +static ssize_t sysfs_unbind_clocksource(struct device *dev, + struct device_attribute *attr, + const char *buf, size_t count) +{ + struct clocksource *cs; + char name[CS_NAME_LEN]; + ssize_t ret; + + ret = sysfs_get_uname(buf, name, count); + if (ret < 0) + return ret; + + ret = -ENODEV; + mutex_lock(&clocksource_mutex); + list_for_each_entry(cs, &clocksource_list, list) { + if (strcmp(cs->name, name)) + continue; + ret = clocksource_unbind(cs); + break; + } + mutex_unlock(&clocksource_mutex); + + return ret ? ret : count; +} + +/** * sysfs_show_available_clocksources - sysfs interface for listing clocksource * @dev: unused + * @attr: unused * @buf: char buffer to be filled with clocksource list * * Provides sysfs interface for listing registered clocksources */ static ssize_t -sysfs_show_available_clocksources(struct sys_device *dev, - struct sysdev_attribute *attr, +sysfs_show_available_clocksources(struct device *dev, + struct device_attribute *attr, char *buf) { struct clocksource *src; @@ -743,35 +1026,41 @@ sysfs_show_available_clocksources(struct sys_device *dev, /* * Sysfs setup bits: */ -static SYSDEV_ATTR(current_clocksource, 0644, sysfs_show_current_clocksources, +static DEVICE_ATTR(current_clocksource, 0644, sysfs_show_current_clocksources, sysfs_override_clocksource); -static SYSDEV_ATTR(available_clocksource, 0444, +static DEVICE_ATTR(unbind_clocksource, 0200, NULL, sysfs_unbind_clocksource); + +static DEVICE_ATTR(available_clocksource, 0444, sysfs_show_available_clocksources, NULL); -static struct sysdev_class clocksource_sysclass = { +static struct bus_type clocksource_subsys = { .name = "clocksource", + .dev_name = "clocksource", }; -static struct sys_device device_clocksource = { +static struct device device_clocksource = { .id = 0, - .cls = &clocksource_sysclass, + .bus = &clocksource_subsys, }; static int __init init_clocksource_sysfs(void) { - int error = sysdev_class_register(&clocksource_sysclass); + int error = subsys_system_register(&clocksource_subsys, NULL); if (!error) - error = sysdev_register(&device_clocksource); + error = device_register(&device_clocksource); if (!error) - error = sysdev_create_file( + error = device_create_file( &device_clocksource, - &attr_current_clocksource); + &dev_attr_current_clocksource); + if (!error) + error = device_create_file(&device_clocksource, + &dev_attr_unbind_clocksource); if (!error) - error = sysdev_create_file( + error = device_create_file( &device_clocksource, - &attr_available_clocksource); + &dev_attr_available_clocksource); return error; } diff --git a/kernel/time/jiffies.c b/kernel/time/jiffies.c index 5404a845690..a6a5bf53e86 100644 --- a/kernel/time/jiffies.c +++ b/kernel/time/jiffies.c @@ -22,8 +22,11 @@ ************************************************************************/ #include <linux/clocksource.h> #include <linux/jiffies.h> +#include <linux/module.h> #include <linux/init.h> +#include "tick-internal.h" + /* The Jiffies based clocksource is the lowest common * denominator clock source which should function on * all systems. It has the same coarse resolution as @@ -31,10 +34,10 @@ * inaccuracies caused by missed or lost timer * interrupts and the inability for the timer * interrupt hardware to accuratly tick at the - * requested HZ value. It is also not reccomended + * requested HZ value. It is also not recommended * for "tick-less" systems. */ -#define NSEC_PER_JIFFY ((u32)((((u64)NSEC_PER_SEC)<<8)/ACTHZ)) +#define NSEC_PER_JIFFY ((NSEC_PER_SEC+HZ/2)/HZ) /* Since jiffies uses a simple NSEC_PER_JIFFY multiplier * conversion, the .shift value could be zero. However @@ -48,14 +51,20 @@ * HZ shrinks, so values greater than 8 overflow 32bits when * HZ=100. */ +#if HZ < 34 +#define JIFFIES_SHIFT 6 +#elif HZ < 67 +#define JIFFIES_SHIFT 7 +#else #define JIFFIES_SHIFT 8 +#endif static cycle_t jiffies_read(struct clocksource *cs) { return (cycle_t) jiffies; } -struct clocksource clocksource_jiffies = { +static struct clocksource clocksource_jiffies = { .name = "jiffies", .rating = 1, /* lowest valid rating*/ .read = jiffies_read, @@ -64,6 +73,25 @@ struct clocksource clocksource_jiffies = { .shift = JIFFIES_SHIFT, }; +__cacheline_aligned_in_smp DEFINE_SEQLOCK(jiffies_lock); + +#if (BITS_PER_LONG < 64) +u64 get_jiffies_64(void) +{ + unsigned long seq; + u64 ret; + + do { + seq = read_seqbegin(&jiffies_lock); + ret = jiffies_64; + } while (read_seqretry(&jiffies_lock, seq)); + return ret; +} +EXPORT_SYMBOL(get_jiffies_64); +#endif + +EXPORT_SYMBOL(jiffies); + static int __init init_jiffies_clocksource(void) { return clocksource_register(&clocksource_jiffies); @@ -75,3 +103,33 @@ struct clocksource * __init __weak clocksource_default_clock(void) { return &clocksource_jiffies; } + +struct clocksource refined_jiffies; + +int register_refined_jiffies(long cycles_per_second) +{ + u64 nsec_per_tick, shift_hz; + long cycles_per_tick; + + + + refined_jiffies = clocksource_jiffies; + refined_jiffies.name = "refined-jiffies"; + refined_jiffies.rating++; + + /* Calc cycles per tick */ + cycles_per_tick = (cycles_per_second + HZ/2)/HZ; + /* shift_hz stores hz<<8 for extra accuracy */ + shift_hz = (u64)cycles_per_second << 8; + shift_hz += cycles_per_tick/2; + do_div(shift_hz, cycles_per_tick); + /* Calculate nsec_per_tick using shift_hz */ + nsec_per_tick = (u64)NSEC_PER_SEC << 8; + nsec_per_tick += (u32)shift_hz/2; + do_div(nsec_per_tick, (u32)shift_hz); + + refined_jiffies.mult = ((u32)nsec_per_tick) << JIFFIES_SHIFT; + + clocksource_register(&refined_jiffies); + return 0; +} diff --git a/kernel/time/ntp.c b/kernel/time/ntp.c index 4800f933910..33db43a3951 100644 --- a/kernel/time/ntp.c +++ b/kernel/time/ntp.c @@ -14,22 +14,28 @@ #include <linux/timex.h> #include <linux/time.h> #include <linux/mm.h> +#include <linux/module.h> +#include <linux/rtc.h> + +#include "tick-internal.h" +#include "ntp_internal.h" /* * NTP timekeeping variables: + * + * Note: All of the NTP state is protected by the timekeeping locks. */ + /* USER_HZ period (usecs): */ unsigned long tick_usec = TICK_USEC; -/* ACTHZ period (nsecs): */ +/* SHIFTED_HZ period (nsecs): */ unsigned long tick_nsec; -u64 tick_length; +static u64 tick_length; static u64 tick_length_base; -static struct hrtimer leap_timer; - #define MAX_TICKADJ 500LL /* usecs */ #define MAX_TICKADJ_SCALED \ (((MAX_TICKADJ * NSEC_PER_USEC) << NTP_SCALE_SHIFT) / NTP_INTERVAL_FREQ) @@ -46,10 +52,7 @@ static struct hrtimer leap_timer; static int time_state = TIME_OK; /* clock status bits: */ -int time_status = STA_UNSYNC; - -/* TAI offset (secs): */ -static long time_tai; +static int time_status = STA_UNSYNC; /* time adjustment (nsecs): */ static s64 time_offset; @@ -58,10 +61,10 @@ static s64 time_offset; static long time_constant = 2; /* maximum error (usecs): */ -long time_maxerror = NTP_PHASE_LIMIT; +static long time_maxerror = NTP_PHASE_LIMIT; /* estimated error (usecs): */ -long time_esterror = NTP_PHASE_LIMIT; +static long time_esterror = NTP_PHASE_LIMIT; /* frequency offset (scaled nsecs/secs): */ static s64 time_freq; @@ -69,11 +72,174 @@ static s64 time_freq; /* time at last adjustment (secs): */ static long time_reftime; -long time_adjust; +static long time_adjust; /* constant (boot-param configurable) NTP tick adjustment (upscaled) */ static s64 ntp_tick_adj; +#ifdef CONFIG_NTP_PPS + +/* + * The following variables are used when a pulse-per-second (PPS) signal + * is available. They establish the engineering parameters of the clock + * discipline loop when controlled by the PPS signal. + */ +#define PPS_VALID 10 /* PPS signal watchdog max (s) */ +#define PPS_POPCORN 4 /* popcorn spike threshold (shift) */ +#define PPS_INTMIN 2 /* min freq interval (s) (shift) */ +#define PPS_INTMAX 8 /* max freq interval (s) (shift) */ +#define PPS_INTCOUNT 4 /* number of consecutive good intervals to + increase pps_shift or consecutive bad + intervals to decrease it */ +#define PPS_MAXWANDER 100000 /* max PPS freq wander (ns/s) */ + +static int pps_valid; /* signal watchdog counter */ +static long pps_tf[3]; /* phase median filter */ +static long pps_jitter; /* current jitter (ns) */ +static struct timespec pps_fbase; /* beginning of the last freq interval */ +static int pps_shift; /* current interval duration (s) (shift) */ +static int pps_intcnt; /* interval counter */ +static s64 pps_freq; /* frequency offset (scaled ns/s) */ +static long pps_stabil; /* current stability (scaled ns/s) */ + +/* + * PPS signal quality monitors + */ +static long pps_calcnt; /* calibration intervals */ +static long pps_jitcnt; /* jitter limit exceeded */ +static long pps_stbcnt; /* stability limit exceeded */ +static long pps_errcnt; /* calibration errors */ + + +/* PPS kernel consumer compensates the whole phase error immediately. + * Otherwise, reduce the offset by a fixed factor times the time constant. + */ +static inline s64 ntp_offset_chunk(s64 offset) +{ + if (time_status & STA_PPSTIME && time_status & STA_PPSSIGNAL) + return offset; + else + return shift_right(offset, SHIFT_PLL + time_constant); +} + +static inline void pps_reset_freq_interval(void) +{ + /* the PPS calibration interval may end + surprisingly early */ + pps_shift = PPS_INTMIN; + pps_intcnt = 0; +} + +/** + * pps_clear - Clears the PPS state variables + */ +static inline void pps_clear(void) +{ + pps_reset_freq_interval(); + pps_tf[0] = 0; + pps_tf[1] = 0; + pps_tf[2] = 0; + pps_fbase.tv_sec = pps_fbase.tv_nsec = 0; + pps_freq = 0; +} + +/* Decrease pps_valid to indicate that another second has passed since + * the last PPS signal. When it reaches 0, indicate that PPS signal is + * missing. + */ +static inline void pps_dec_valid(void) +{ + if (pps_valid > 0) + pps_valid--; + else { + time_status &= ~(STA_PPSSIGNAL | STA_PPSJITTER | + STA_PPSWANDER | STA_PPSERROR); + pps_clear(); + } +} + +static inline void pps_set_freq(s64 freq) +{ + pps_freq = freq; +} + +static inline int is_error_status(int status) +{ + return (status & (STA_UNSYNC|STA_CLOCKERR)) + /* PPS signal lost when either PPS time or + * PPS frequency synchronization requested + */ + || ((status & (STA_PPSFREQ|STA_PPSTIME)) + && !(status & STA_PPSSIGNAL)) + /* PPS jitter exceeded when + * PPS time synchronization requested */ + || ((status & (STA_PPSTIME|STA_PPSJITTER)) + == (STA_PPSTIME|STA_PPSJITTER)) + /* PPS wander exceeded or calibration error when + * PPS frequency synchronization requested + */ + || ((status & STA_PPSFREQ) + && (status & (STA_PPSWANDER|STA_PPSERROR))); +} + +static inline void pps_fill_timex(struct timex *txc) +{ + txc->ppsfreq = shift_right((pps_freq >> PPM_SCALE_INV_SHIFT) * + PPM_SCALE_INV, NTP_SCALE_SHIFT); + txc->jitter = pps_jitter; + if (!(time_status & STA_NANO)) + txc->jitter /= NSEC_PER_USEC; + txc->shift = pps_shift; + txc->stabil = pps_stabil; + txc->jitcnt = pps_jitcnt; + txc->calcnt = pps_calcnt; + txc->errcnt = pps_errcnt; + txc->stbcnt = pps_stbcnt; +} + +#else /* !CONFIG_NTP_PPS */ + +static inline s64 ntp_offset_chunk(s64 offset) +{ + return shift_right(offset, SHIFT_PLL + time_constant); +} + +static inline void pps_reset_freq_interval(void) {} +static inline void pps_clear(void) {} +static inline void pps_dec_valid(void) {} +static inline void pps_set_freq(s64 freq) {} + +static inline int is_error_status(int status) +{ + return status & (STA_UNSYNC|STA_CLOCKERR); +} + +static inline void pps_fill_timex(struct timex *txc) +{ + /* PPS is not implemented, so these are zero */ + txc->ppsfreq = 0; + txc->jitter = 0; + txc->shift = 0; + txc->stabil = 0; + txc->jitcnt = 0; + txc->calcnt = 0; + txc->errcnt = 0; + txc->stbcnt = 0; +} + +#endif /* CONFIG_NTP_PPS */ + + +/** + * ntp_synced - Returns 1 if the NTP status is not UNSYNC + * + */ +static inline int ntp_synced(void) +{ + return !(time_status & STA_UNSYNC); +} + + /* * NTP methods: */ @@ -116,7 +282,7 @@ static inline s64 ntp_update_offset_fll(s64 offset64, long secs) time_status |= STA_MODE; - return div_s64(offset64 << (NTP_SCALE_SHIFT - SHIFT_FLL), secs); + return div64_long(offset64 << (NTP_SCALE_SHIFT - SHIFT_FLL), secs); } static void ntp_update_offset(long offset) @@ -142,17 +308,25 @@ static void ntp_update_offset(long offset) * Select how the frequency is to be controlled * and in which mode (PLL or FLL). */ - secs = xtime.tv_sec - time_reftime; + secs = get_seconds() - time_reftime; if (unlikely(time_status & STA_FREQHOLD)) secs = 0; - time_reftime = xtime.tv_sec; + time_reftime = get_seconds(); offset64 = offset; - freq_adj = (offset64 * secs) << - (NTP_SCALE_SHIFT - 2 * (SHIFT_PLL + 2 + time_constant)); + freq_adj = ntp_update_offset_fll(offset64, secs); - freq_adj += ntp_update_offset_fll(offset64, secs); + /* + * Clamp update interval to reduce PLL gain with low + * sampling rate (e.g. intermittent network connection) + * to avoid instability. + */ + if (unlikely(secs > 1 << (SHIFT_PLL + 1 + time_constant))) + secs = 1 << (SHIFT_PLL + 1 + time_constant); + + freq_adj += (offset64 * secs) << + (NTP_SCALE_SHIFT - 2 * (SHIFT_PLL + 2 + time_constant)); freq_adj = min(freq_adj + time_freq, MAXFREQ_SCALED); @@ -163,8 +337,6 @@ static void ntp_update_offset(long offset) /** * ntp_clear - Clears the NTP state variables - * - * Must be called while holding a write on the xtime_lock */ void ntp_clear(void) { @@ -177,63 +349,75 @@ void ntp_clear(void) tick_length = tick_length_base; time_offset = 0; + + /* Clear PPS state variables */ + pps_clear(); } + +u64 ntp_tick_length(void) +{ + return tick_length; +} + + /* - * Leap second processing. If in leap-insert state at the end of the - * day, the system clock is set back one second; if in leap-delete - * state, the system clock is set ahead one second. + * this routine handles the overflow of the microsecond field + * + * The tricky bits of code to handle the accurate clock support + * were provided by Dave Mills (Mills@UDEL.EDU) of NTP fame. + * They were originally developed for SUN and DEC kernels. + * All the kudos should go to Dave for this stuff. + * + * Also handles leap second processing, and returns leap offset */ -static enum hrtimer_restart ntp_leap_second(struct hrtimer *timer) +int second_overflow(unsigned long secs) { - enum hrtimer_restart res = HRTIMER_NORESTART; - - write_seqlock(&xtime_lock); + s64 delta; + int leap = 0; + /* + * Leap second processing. If in leap-insert state at the end of the + * day, the system clock is set back one second; if in leap-delete + * state, the system clock is set ahead one second. + */ switch (time_state) { case TIME_OK: + if (time_status & STA_INS) + time_state = TIME_INS; + else if (time_status & STA_DEL) + time_state = TIME_DEL; break; case TIME_INS: - timekeeping_leap_insert(-1); - time_state = TIME_OOP; - printk(KERN_NOTICE - "Clock: inserting leap second 23:59:60 UTC\n"); - hrtimer_add_expires_ns(&leap_timer, NSEC_PER_SEC); - res = HRTIMER_RESTART; + if (!(time_status & STA_INS)) + time_state = TIME_OK; + else if (secs % 86400 == 0) { + leap = -1; + time_state = TIME_OOP; + printk(KERN_NOTICE + "Clock: inserting leap second 23:59:60 UTC\n"); + } break; case TIME_DEL: - timekeeping_leap_insert(1); - time_tai--; - time_state = TIME_WAIT; - printk(KERN_NOTICE - "Clock: deleting leap second 23:59:59 UTC\n"); + if (!(time_status & STA_DEL)) + time_state = TIME_OK; + else if ((secs + 1) % 86400 == 0) { + leap = 1; + time_state = TIME_WAIT; + printk(KERN_NOTICE + "Clock: deleting leap second 23:59:59 UTC\n"); + } break; case TIME_OOP: - time_tai++; time_state = TIME_WAIT; - /* fall through */ + break; + case TIME_WAIT: if (!(time_status & (STA_INS | STA_DEL))) time_state = TIME_OK; break; } - write_sequnlock(&xtime_lock); - - return res; -} - -/* - * this routine handles the overflow of the microsecond field - * - * The tricky bits of code to handle the accurate clock support - * were provided by Dave Mills (Mills@UDEL.EDU) of NTP fame. - * They were originally developed for SUN and DEC kernels. - * All the kudos should go to Dave for this stuff. - */ -void second_overflow(void) -{ - s64 delta; /* Bump the maxerror field */ time_maxerror += MAXFREQ / NSEC_PER_USEC; @@ -242,41 +426,40 @@ void second_overflow(void) time_status |= STA_UNSYNC; } - /* - * Compute the phase adjustment for the next second. The offset is - * reduced by a fixed factor times the time constant. - */ + /* Compute the phase adjustment for the next second */ tick_length = tick_length_base; - delta = shift_right(time_offset, SHIFT_PLL + time_constant); + delta = ntp_offset_chunk(time_offset); time_offset -= delta; tick_length += delta; + /* Check PPS signal */ + pps_dec_valid(); + if (!time_adjust) - return; + goto out; if (time_adjust > MAX_TICKADJ) { time_adjust -= MAX_TICKADJ; tick_length += MAX_TICKADJ_SCALED; - return; + goto out; } if (time_adjust < -MAX_TICKADJ) { time_adjust += MAX_TICKADJ; tick_length -= MAX_TICKADJ_SCALED; - return; + goto out; } tick_length += (s64)(time_adjust * NSEC_PER_USEC / NTP_INTERVAL_FREQ) << NTP_SCALE_SHIFT; time_adjust = 0; -} - -#ifdef CONFIG_GENERIC_CMOS_UPDATE -/* Disable the cmos update - used by virtualization and embedded */ -int no_sync_cmos_clock __read_mostly; +out: + return leap; +} +#if defined(CONFIG_GENERIC_CMOS_UPDATE) || defined(CONFIG_RTC_SYSTOHC) static void sync_cmos_clock(struct work_struct *work); static DECLARE_DELAYED_WORK(sync_cmos_work, sync_cmos_clock); @@ -292,6 +475,7 @@ static void sync_cmos_clock(struct work_struct *work) * called as close as possible to 500 ms before the new second starts. * This code is run on a timer. If the clock is set, that timer * may not expire at the correct time. Thus, we adjust... + * We want the clock to be within a couple of ticks from the target. */ if (!ntp_synced()) { /* @@ -302,14 +486,26 @@ static void sync_cmos_clock(struct work_struct *work) } getnstimeofday(&now); - if (abs(now.tv_nsec - (NSEC_PER_SEC / 2)) <= tick_nsec / 2) - fail = update_persistent_clock(now); + if (abs(now.tv_nsec - (NSEC_PER_SEC / 2)) <= tick_nsec * 5) { + struct timespec adjust = now; + + fail = -ENODEV; + if (persistent_clock_is_local) + adjust.tv_sec -= (sys_tz.tz_minuteswest * 60); +#ifdef CONFIG_GENERIC_CMOS_UPDATE + fail = update_persistent_clock(adjust); +#endif +#ifdef CONFIG_RTC_SYSTOHC + if (fail == -ENODEV) + fail = rtc_set_ntp_time(adjust); +#endif + } next.tv_nsec = (NSEC_PER_SEC / 2) - now.tv_nsec - (TICK_NSEC / 2); if (next.tv_nsec <= 0) next.tv_nsec += NSEC_PER_SEC; - if (!fail) + if (!fail || fail == -ENODEV) next.tv_sec = 659; else next.tv_sec = 0; @@ -318,40 +514,19 @@ static void sync_cmos_clock(struct work_struct *work) next.tv_sec++; next.tv_nsec -= NSEC_PER_SEC; } - schedule_delayed_work(&sync_cmos_work, timespec_to_jiffies(&next)); + queue_delayed_work(system_power_efficient_wq, + &sync_cmos_work, timespec_to_jiffies(&next)); } -static void notify_cmos_timer(void) +void ntp_notify_cmos_timer(void) { - if (!no_sync_cmos_clock) - schedule_delayed_work(&sync_cmos_work, 0); + queue_delayed_work(system_power_efficient_wq, &sync_cmos_work, 0); } #else -static inline void notify_cmos_timer(void) { } +void ntp_notify_cmos_timer(void) { } #endif -/* - * Start the leap seconds timer: - */ -static inline void ntp_start_leap_timer(struct timespec *ts) -{ - long now = ts->tv_sec; - - if (time_status & STA_INS) { - time_state = TIME_INS; - now += 86400 - now % 86400; - hrtimer_start(&leap_timer, ktime_set(now, 0), HRTIMER_MODE_ABS); - - return; - } - - if (time_status & STA_DEL) { - time_state = TIME_DEL; - now += 86400 - (now + 1) % 86400; - hrtimer_start(&leap_timer, ktime_set(now, 0), HRTIMER_MODE_ABS); - } -} /* * Propagate a new txc->status value into the NTP state: @@ -361,6 +536,8 @@ static inline void process_adj_status(struct timex *txc, struct timespec *ts) if ((time_status & STA_PLL) && !(txc->status & STA_PLL)) { time_state = TIME_OK; time_status = STA_UNSYNC; + /* restart PPS frequency calibration */ + pps_reset_freq_interval(); } /* @@ -368,34 +545,17 @@ static inline void process_adj_status(struct timex *txc, struct timespec *ts) * reference time to current time. */ if (!(time_status & STA_PLL) && (txc->status & STA_PLL)) - time_reftime = xtime.tv_sec; + time_reftime = get_seconds(); /* only set allowed bits */ time_status &= STA_RONLY; time_status |= txc->status & ~STA_RONLY; - - switch (time_state) { - case TIME_OK: - ntp_start_leap_timer(ts); - break; - case TIME_INS: - case TIME_DEL: - time_state = TIME_OK; - ntp_start_leap_timer(ts); - case TIME_WAIT: - if (!(time_status & (STA_INS | STA_DEL))) - time_state = TIME_OK; - break; - case TIME_OOP: - hrtimer_restart(&leap_timer); - break; - } } -/* - * Called with the xtime lock held, so we can access and modify - * all the global NTP state: - */ -static inline void process_adjtimex_modes(struct timex *txc, struct timespec *ts) + + +static inline void process_adjtimex_modes(struct timex *txc, + struct timespec *ts, + s32 *time_tai) { if (txc->modes & ADJ_STATUS) process_adj_status(txc, ts); @@ -410,6 +570,8 @@ static inline void process_adjtimex_modes(struct timex *txc, struct timespec *ts time_freq = txc->freq * PPM_SCALE; time_freq = min(time_freq, MAXFREQ_SCALED); time_freq = max(time_freq, -MAXFREQ_SCALED); + /* update pps_freq */ + pps_set_freq(time_freq); } if (txc->modes & ADJ_MAXERROR) @@ -427,7 +589,7 @@ static inline void process_adjtimex_modes(struct timex *txc, struct timespec *ts } if (txc->modes & ADJ_TAI && txc->constant > 0) - time_tai = txc->constant; + *time_tai = txc->constant; if (txc->modes & ADJ_OFFSET) ntp_update_offset(txc->offset); @@ -439,16 +601,13 @@ static inline void process_adjtimex_modes(struct timex *txc, struct timespec *ts ntp_update_frequency(); } -/* - * adjtimex mainly allows reading (and writing, if superuser) of - * kernel time-keeping variables. used by xntpd. + + +/** + * ntp_validate_timex - Ensures the timex is ok for use in do_adjtimex */ -int do_adjtimex(struct timex *txc) +int ntp_validate_timex(struct timex *txc) { - struct timespec ts; - int result; - - /* Validate the data before disabling interrupts */ if (txc->modes & ADJ_ADJTIME) { /* singleshot must not be used with any other mode bits */ if (!(txc->modes & ADJ_OFFSET_SINGLESHOT)) @@ -460,7 +619,6 @@ int do_adjtimex(struct timex *txc) /* In order to modify anything, you gotta be super-user! */ if (txc->modes && !capable(CAP_SYS_TIME)) return -EPERM; - /* * if the quartz is off by more than 10% then * something is VERY wrong! @@ -469,14 +627,22 @@ int do_adjtimex(struct timex *txc) (txc->tick < 900000/USER_HZ || txc->tick > 1100000/USER_HZ)) return -EINVAL; - - if (txc->modes & ADJ_STATUS && time_state != TIME_OK) - hrtimer_cancel(&leap_timer); } - getnstimeofday(&ts); + if ((txc->modes & ADJ_SETOFFSET) && (!capable(CAP_SYS_TIME))) + return -EPERM; + + return 0; +} + - write_seqlock_irq(&xtime_lock); +/* + * adjtimex mainly allows reading (and writing, if superuser) of + * kernel time-keeping variables. used by xntpd. + */ +int __do_adjtimex(struct timex *txc, struct timespec *ts, s32 *time_tai) +{ + int result; if (txc->modes & ADJ_ADJTIME) { long save_adjust = time_adjust; @@ -491,7 +657,7 @@ int do_adjtimex(struct timex *txc) /* If there are input parameters, then process them: */ if (txc->modes) - process_adjtimex_modes(txc, &ts); + process_adjtimex_modes(txc, ts, time_tai); txc->offset = shift_right(time_offset * NTP_INTERVAL_FREQ, NTP_SCALE_SHIFT); @@ -500,7 +666,8 @@ int do_adjtimex(struct timex *txc) } result = time_state; /* mostly `TIME_OK' */ - if (time_status & (STA_UNSYNC|STA_CLOCKERR)) + /* check for errors */ + if (is_error_status(time_status)) result = TIME_ERROR; txc->freq = shift_right((time_freq >> PPM_SCALE_INV_SHIFT) * @@ -512,33 +679,257 @@ int do_adjtimex(struct timex *txc) txc->precision = 1; txc->tolerance = MAXFREQ_SCALED / PPM_SCALE; txc->tick = tick_usec; - txc->tai = time_tai; - - /* PPS is not implemented, so these are zero */ - txc->ppsfreq = 0; - txc->jitter = 0; - txc->shift = 0; - txc->stabil = 0; - txc->jitcnt = 0; - txc->calcnt = 0; - txc->errcnt = 0; - txc->stbcnt = 0; + txc->tai = *time_tai; - write_sequnlock_irq(&xtime_lock); + /* fill PPS status fields */ + pps_fill_timex(txc); - txc->time.tv_sec = ts.tv_sec; - txc->time.tv_usec = ts.tv_nsec; + txc->time.tv_sec = ts->tv_sec; + txc->time.tv_usec = ts->tv_nsec; if (!(time_status & STA_NANO)) txc->time.tv_usec /= NSEC_PER_USEC; - notify_cmos_timer(); - return result; } +#ifdef CONFIG_NTP_PPS + +/* actually struct pps_normtime is good old struct timespec, but it is + * semantically different (and it is the reason why it was invented): + * pps_normtime.nsec has a range of ( -NSEC_PER_SEC / 2, NSEC_PER_SEC / 2 ] + * while timespec.tv_nsec has a range of [0, NSEC_PER_SEC) */ +struct pps_normtime { + __kernel_time_t sec; /* seconds */ + long nsec; /* nanoseconds */ +}; + +/* normalize the timestamp so that nsec is in the + ( -NSEC_PER_SEC / 2, NSEC_PER_SEC / 2 ] interval */ +static inline struct pps_normtime pps_normalize_ts(struct timespec ts) +{ + struct pps_normtime norm = { + .sec = ts.tv_sec, + .nsec = ts.tv_nsec + }; + + if (norm.nsec > (NSEC_PER_SEC >> 1)) { + norm.nsec -= NSEC_PER_SEC; + norm.sec++; + } + + return norm; +} + +/* get current phase correction and jitter */ +static inline long pps_phase_filter_get(long *jitter) +{ + *jitter = pps_tf[0] - pps_tf[1]; + if (*jitter < 0) + *jitter = -*jitter; + + /* TODO: test various filters */ + return pps_tf[0]; +} + +/* add the sample to the phase filter */ +static inline void pps_phase_filter_add(long err) +{ + pps_tf[2] = pps_tf[1]; + pps_tf[1] = pps_tf[0]; + pps_tf[0] = err; +} + +/* decrease frequency calibration interval length. + * It is halved after four consecutive unstable intervals. + */ +static inline void pps_dec_freq_interval(void) +{ + if (--pps_intcnt <= -PPS_INTCOUNT) { + pps_intcnt = -PPS_INTCOUNT; + if (pps_shift > PPS_INTMIN) { + pps_shift--; + pps_intcnt = 0; + } + } +} + +/* increase frequency calibration interval length. + * It is doubled after four consecutive stable intervals. + */ +static inline void pps_inc_freq_interval(void) +{ + if (++pps_intcnt >= PPS_INTCOUNT) { + pps_intcnt = PPS_INTCOUNT; + if (pps_shift < PPS_INTMAX) { + pps_shift++; + pps_intcnt = 0; + } + } +} + +/* update clock frequency based on MONOTONIC_RAW clock PPS signal + * timestamps + * + * At the end of the calibration interval the difference between the + * first and last MONOTONIC_RAW clock timestamps divided by the length + * of the interval becomes the frequency update. If the interval was + * too long, the data are discarded. + * Returns the difference between old and new frequency values. + */ +static long hardpps_update_freq(struct pps_normtime freq_norm) +{ + long delta, delta_mod; + s64 ftemp; + + /* check if the frequency interval was too long */ + if (freq_norm.sec > (2 << pps_shift)) { + time_status |= STA_PPSERROR; + pps_errcnt++; + pps_dec_freq_interval(); + printk_deferred(KERN_ERR + "hardpps: PPSERROR: interval too long - %ld s\n", + freq_norm.sec); + return 0; + } + + /* here the raw frequency offset and wander (stability) is + * calculated. If the wander is less than the wander threshold + * the interval is increased; otherwise it is decreased. + */ + ftemp = div_s64(((s64)(-freq_norm.nsec)) << NTP_SCALE_SHIFT, + freq_norm.sec); + delta = shift_right(ftemp - pps_freq, NTP_SCALE_SHIFT); + pps_freq = ftemp; + if (delta > PPS_MAXWANDER || delta < -PPS_MAXWANDER) { + printk_deferred(KERN_WARNING + "hardpps: PPSWANDER: change=%ld\n", delta); + time_status |= STA_PPSWANDER; + pps_stbcnt++; + pps_dec_freq_interval(); + } else { /* good sample */ + pps_inc_freq_interval(); + } + + /* the stability metric is calculated as the average of recent + * frequency changes, but is used only for performance + * monitoring + */ + delta_mod = delta; + if (delta_mod < 0) + delta_mod = -delta_mod; + pps_stabil += (div_s64(((s64)delta_mod) << + (NTP_SCALE_SHIFT - SHIFT_USEC), + NSEC_PER_USEC) - pps_stabil) >> PPS_INTMIN; + + /* if enabled, the system clock frequency is updated */ + if ((time_status & STA_PPSFREQ) != 0 && + (time_status & STA_FREQHOLD) == 0) { + time_freq = pps_freq; + ntp_update_frequency(); + } + + return delta; +} + +/* correct REALTIME clock phase error against PPS signal */ +static void hardpps_update_phase(long error) +{ + long correction = -error; + long jitter; + + /* add the sample to the median filter */ + pps_phase_filter_add(correction); + correction = pps_phase_filter_get(&jitter); + + /* Nominal jitter is due to PPS signal noise. If it exceeds the + * threshold, the sample is discarded; otherwise, if so enabled, + * the time offset is updated. + */ + if (jitter > (pps_jitter << PPS_POPCORN)) { + printk_deferred(KERN_WARNING + "hardpps: PPSJITTER: jitter=%ld, limit=%ld\n", + jitter, (pps_jitter << PPS_POPCORN)); + time_status |= STA_PPSJITTER; + pps_jitcnt++; + } else if (time_status & STA_PPSTIME) { + /* correct the time using the phase offset */ + time_offset = div_s64(((s64)correction) << NTP_SCALE_SHIFT, + NTP_INTERVAL_FREQ); + /* cancel running adjtime() */ + time_adjust = 0; + } + /* update jitter */ + pps_jitter += (jitter - pps_jitter) >> PPS_INTMIN; +} + +/* + * __hardpps() - discipline CPU clock oscillator to external PPS signal + * + * This routine is called at each PPS signal arrival in order to + * discipline the CPU clock oscillator to the PPS signal. It takes two + * parameters: REALTIME and MONOTONIC_RAW clock timestamps. The former + * is used to correct clock phase error and the latter is used to + * correct the frequency. + * + * This code is based on David Mills's reference nanokernel + * implementation. It was mostly rewritten but keeps the same idea. + */ +void __hardpps(const struct timespec *phase_ts, const struct timespec *raw_ts) +{ + struct pps_normtime pts_norm, freq_norm; + + pts_norm = pps_normalize_ts(*phase_ts); + + /* clear the error bits, they will be set again if needed */ + time_status &= ~(STA_PPSJITTER | STA_PPSWANDER | STA_PPSERROR); + + /* indicate signal presence */ + time_status |= STA_PPSSIGNAL; + pps_valid = PPS_VALID; + + /* when called for the first time, + * just start the frequency interval */ + if (unlikely(pps_fbase.tv_sec == 0)) { + pps_fbase = *raw_ts; + return; + } + + /* ok, now we have a base for frequency calculation */ + freq_norm = pps_normalize_ts(timespec_sub(*raw_ts, pps_fbase)); + + /* check that the signal is in the range + * [1s - MAXFREQ us, 1s + MAXFREQ us], otherwise reject it */ + if ((freq_norm.sec == 0) || + (freq_norm.nsec > MAXFREQ * freq_norm.sec) || + (freq_norm.nsec < -MAXFREQ * freq_norm.sec)) { + time_status |= STA_PPSJITTER; + /* restart the frequency calibration interval */ + pps_fbase = *raw_ts; + printk_deferred(KERN_ERR "hardpps: PPSJITTER: bad pulse\n"); + return; + } + + /* signal is ok */ + + /* check if the current frequency interval is finished */ + if (freq_norm.sec >= (1 << pps_shift)) { + pps_calcnt++; + /* restart the frequency calibration interval */ + pps_fbase = *raw_ts; + hardpps_update_freq(freq_norm); + } + + hardpps_update_phase(pts_norm.nsec); + +} +#endif /* CONFIG_NTP_PPS */ + static int __init ntp_tick_adj_setup(char *str) { - ntp_tick_adj = simple_strtol(str, NULL, 0); + int rc = kstrtol(str, 0, (long *)&ntp_tick_adj); + + if (rc) + return rc; ntp_tick_adj <<= NTP_SCALE_SHIFT; return 1; @@ -549,6 +940,4 @@ __setup("ntp_tick_adj=", ntp_tick_adj_setup); void __init ntp_init(void) { ntp_clear(); - hrtimer_init(&leap_timer, CLOCK_REALTIME, HRTIMER_MODE_ABS); - leap_timer.function = ntp_leap_second; } diff --git a/kernel/time/ntp_internal.h b/kernel/time/ntp_internal.h new file mode 100644 index 00000000000..1950cb4ca2a --- /dev/null +++ b/kernel/time/ntp_internal.h @@ -0,0 +1,12 @@ +#ifndef _LINUX_NTP_INTERNAL_H +#define _LINUX_NTP_INTERNAL_H + +extern void ntp_init(void); +extern void ntp_clear(void); +/* Returns how long ticks are at present, in ns / 2^NTP_SCALE_SHIFT. */ +extern u64 ntp_tick_length(void); +extern int second_overflow(unsigned long secs); +extern int ntp_validate_timex(struct timex *); +extern int __do_adjtimex(struct timex *, struct timespec *, s32 *); +extern void __hardpps(const struct timespec *, const struct timespec *); +#endif /* _LINUX_NTP_INTERNAL_H */ diff --git a/kernel/time/posix-clock.c b/kernel/time/posix-clock.c new file mode 100644 index 00000000000..ce033c7aa2e --- /dev/null +++ b/kernel/time/posix-clock.c @@ -0,0 +1,446 @@ +/* + * posix-clock.c - support for dynamic clock devices + * + * Copyright (C) 2010 OMICRON electronics GmbH + * + * This program is free software; you can redistribute it and/or modify + * it under the terms of the GNU General Public License as published by + * the Free Software Foundation; either version 2 of the License, or + * (at your option) any later version. + * + * This program is distributed in the hope that it will be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the + * GNU General Public License for more details. + * + * You should have received a copy of the GNU General Public License + * along with this program; if not, write to the Free Software + * Foundation, Inc., 675 Mass Ave, Cambridge, MA 02139, USA. + */ +#include <linux/device.h> +#include <linux/export.h> +#include <linux/file.h> +#include <linux/posix-clock.h> +#include <linux/slab.h> +#include <linux/syscalls.h> +#include <linux/uaccess.h> + +static void delete_clock(struct kref *kref); + +/* + * Returns NULL if the posix_clock instance attached to 'fp' is old and stale. + */ +static struct posix_clock *get_posix_clock(struct file *fp) +{ + struct posix_clock *clk = fp->private_data; + + down_read(&clk->rwsem); + + if (!clk->zombie) + return clk; + + up_read(&clk->rwsem); + + return NULL; +} + +static void put_posix_clock(struct posix_clock *clk) +{ + up_read(&clk->rwsem); +} + +static ssize_t posix_clock_read(struct file *fp, char __user *buf, + size_t count, loff_t *ppos) +{ + struct posix_clock *clk = get_posix_clock(fp); + int err = -EINVAL; + + if (!clk) + return -ENODEV; + + if (clk->ops.read) + err = clk->ops.read(clk, fp->f_flags, buf, count); + + put_posix_clock(clk); + + return err; +} + +static unsigned int posix_clock_poll(struct file *fp, poll_table *wait) +{ + struct posix_clock *clk = get_posix_clock(fp); + int result = 0; + + if (!clk) + return -ENODEV; + + if (clk->ops.poll) + result = clk->ops.poll(clk, fp, wait); + + put_posix_clock(clk); + + return result; +} + +static int posix_clock_fasync(int fd, struct file *fp, int on) +{ + struct posix_clock *clk = get_posix_clock(fp); + int err = 0; + + if (!clk) + return -ENODEV; + + if (clk->ops.fasync) + err = clk->ops.fasync(clk, fd, fp, on); + + put_posix_clock(clk); + + return err; +} + +static int posix_clock_mmap(struct file *fp, struct vm_area_struct *vma) +{ + struct posix_clock *clk = get_posix_clock(fp); + int err = -ENODEV; + + if (!clk) + return -ENODEV; + + if (clk->ops.mmap) + err = clk->ops.mmap(clk, vma); + + put_posix_clock(clk); + + return err; +} + +static long posix_clock_ioctl(struct file *fp, + unsigned int cmd, unsigned long arg) +{ + struct posix_clock *clk = get_posix_clock(fp); + int err = -ENOTTY; + + if (!clk) + return -ENODEV; + + if (clk->ops.ioctl) + err = clk->ops.ioctl(clk, cmd, arg); + + put_posix_clock(clk); + + return err; +} + +#ifdef CONFIG_COMPAT +static long posix_clock_compat_ioctl(struct file *fp, + unsigned int cmd, unsigned long arg) +{ + struct posix_clock *clk = get_posix_clock(fp); + int err = -ENOTTY; + + if (!clk) + return -ENODEV; + + if (clk->ops.ioctl) + err = clk->ops.ioctl(clk, cmd, arg); + + put_posix_clock(clk); + + return err; +} +#endif + +static int posix_clock_open(struct inode *inode, struct file *fp) +{ + int err; + struct posix_clock *clk = + container_of(inode->i_cdev, struct posix_clock, cdev); + + down_read(&clk->rwsem); + + if (clk->zombie) { + err = -ENODEV; + goto out; + } + if (clk->ops.open) + err = clk->ops.open(clk, fp->f_mode); + else + err = 0; + + if (!err) { + kref_get(&clk->kref); + fp->private_data = clk; + } +out: + up_read(&clk->rwsem); + return err; +} + +static int posix_clock_release(struct inode *inode, struct file *fp) +{ + struct posix_clock *clk = fp->private_data; + int err = 0; + + if (clk->ops.release) + err = clk->ops.release(clk); + + kref_put(&clk->kref, delete_clock); + + fp->private_data = NULL; + + return err; +} + +static const struct file_operations posix_clock_file_operations = { + .owner = THIS_MODULE, + .llseek = no_llseek, + .read = posix_clock_read, + .poll = posix_clock_poll, + .unlocked_ioctl = posix_clock_ioctl, + .open = posix_clock_open, + .release = posix_clock_release, + .fasync = posix_clock_fasync, + .mmap = posix_clock_mmap, +#ifdef CONFIG_COMPAT + .compat_ioctl = posix_clock_compat_ioctl, +#endif +}; + +int posix_clock_register(struct posix_clock *clk, dev_t devid) +{ + int err; + + kref_init(&clk->kref); + init_rwsem(&clk->rwsem); + + cdev_init(&clk->cdev, &posix_clock_file_operations); + clk->cdev.owner = clk->ops.owner; + err = cdev_add(&clk->cdev, devid, 1); + + return err; +} +EXPORT_SYMBOL_GPL(posix_clock_register); + +static void delete_clock(struct kref *kref) +{ + struct posix_clock *clk = container_of(kref, struct posix_clock, kref); + + if (clk->release) + clk->release(clk); +} + +void posix_clock_unregister(struct posix_clock *clk) +{ + cdev_del(&clk->cdev); + + down_write(&clk->rwsem); + clk->zombie = true; + up_write(&clk->rwsem); + + kref_put(&clk->kref, delete_clock); +} +EXPORT_SYMBOL_GPL(posix_clock_unregister); + +struct posix_clock_desc { + struct file *fp; + struct posix_clock *clk; +}; + +static int get_clock_desc(const clockid_t id, struct posix_clock_desc *cd) +{ + struct file *fp = fget(CLOCKID_TO_FD(id)); + int err = -EINVAL; + + if (!fp) + return err; + + if (fp->f_op->open != posix_clock_open || !fp->private_data) + goto out; + + cd->fp = fp; + cd->clk = get_posix_clock(fp); + + err = cd->clk ? 0 : -ENODEV; +out: + if (err) + fput(fp); + return err; +} + +static void put_clock_desc(struct posix_clock_desc *cd) +{ + put_posix_clock(cd->clk); + fput(cd->fp); +} + +static int pc_clock_adjtime(clockid_t id, struct timex *tx) +{ + struct posix_clock_desc cd; + int err; + + err = get_clock_desc(id, &cd); + if (err) + return err; + + if ((cd.fp->f_mode & FMODE_WRITE) == 0) { + err = -EACCES; + goto out; + } + + if (cd.clk->ops.clock_adjtime) + err = cd.clk->ops.clock_adjtime(cd.clk, tx); + else + err = -EOPNOTSUPP; +out: + put_clock_desc(&cd); + + return err; +} + +static int pc_clock_gettime(clockid_t id, struct timespec *ts) +{ + struct posix_clock_desc cd; + int err; + + err = get_clock_desc(id, &cd); + if (err) + return err; + + if (cd.clk->ops.clock_gettime) + err = cd.clk->ops.clock_gettime(cd.clk, ts); + else + err = -EOPNOTSUPP; + + put_clock_desc(&cd); + + return err; +} + +static int pc_clock_getres(clockid_t id, struct timespec *ts) +{ + struct posix_clock_desc cd; + int err; + + err = get_clock_desc(id, &cd); + if (err) + return err; + + if (cd.clk->ops.clock_getres) + err = cd.clk->ops.clock_getres(cd.clk, ts); + else + err = -EOPNOTSUPP; + + put_clock_desc(&cd); + + return err; +} + +static int pc_clock_settime(clockid_t id, const struct timespec *ts) +{ + struct posix_clock_desc cd; + int err; + + err = get_clock_desc(id, &cd); + if (err) + return err; + + if ((cd.fp->f_mode & FMODE_WRITE) == 0) { + err = -EACCES; + goto out; + } + + if (cd.clk->ops.clock_settime) + err = cd.clk->ops.clock_settime(cd.clk, ts); + else + err = -EOPNOTSUPP; +out: + put_clock_desc(&cd); + + return err; +} + +static int pc_timer_create(struct k_itimer *kit) +{ + clockid_t id = kit->it_clock; + struct posix_clock_desc cd; + int err; + + err = get_clock_desc(id, &cd); + if (err) + return err; + + if (cd.clk->ops.timer_create) + err = cd.clk->ops.timer_create(cd.clk, kit); + else + err = -EOPNOTSUPP; + + put_clock_desc(&cd); + + return err; +} + +static int pc_timer_delete(struct k_itimer *kit) +{ + clockid_t id = kit->it_clock; + struct posix_clock_desc cd; + int err; + + err = get_clock_desc(id, &cd); + if (err) + return err; + + if (cd.clk->ops.timer_delete) + err = cd.clk->ops.timer_delete(cd.clk, kit); + else + err = -EOPNOTSUPP; + + put_clock_desc(&cd); + + return err; +} + +static void pc_timer_gettime(struct k_itimer *kit, struct itimerspec *ts) +{ + clockid_t id = kit->it_clock; + struct posix_clock_desc cd; + + if (get_clock_desc(id, &cd)) + return; + + if (cd.clk->ops.timer_gettime) + cd.clk->ops.timer_gettime(cd.clk, kit, ts); + + put_clock_desc(&cd); +} + +static int pc_timer_settime(struct k_itimer *kit, int flags, + struct itimerspec *ts, struct itimerspec *old) +{ + clockid_t id = kit->it_clock; + struct posix_clock_desc cd; + int err; + + err = get_clock_desc(id, &cd); + if (err) + return err; + + if (cd.clk->ops.timer_settime) + err = cd.clk->ops.timer_settime(cd.clk, kit, flags, ts, old); + else + err = -EOPNOTSUPP; + + put_clock_desc(&cd); + + return err; +} + +struct k_clock clock_posix_dynamic = { + .clock_getres = pc_clock_getres, + .clock_set = pc_clock_settime, + .clock_get = pc_clock_gettime, + .clock_adj = pc_clock_adjtime, + .timer_create = pc_timer_create, + .timer_set = pc_timer_settime, + .timer_del = pc_timer_delete, + .timer_get = pc_timer_gettime, +}; diff --git a/kernel/time/sched_clock.c b/kernel/time/sched_clock.c new file mode 100644 index 00000000000..01d2d15aa66 --- /dev/null +++ b/kernel/time/sched_clock.c @@ -0,0 +1,217 @@ +/* + * sched_clock.c: support for extending counters to full 64-bit ns counter + * + * This program is free software; you can redistribute it and/or modify + * it under the terms of the GNU General Public License version 2 as + * published by the Free Software Foundation. + */ +#include <linux/clocksource.h> +#include <linux/init.h> +#include <linux/jiffies.h> +#include <linux/ktime.h> +#include <linux/kernel.h> +#include <linux/moduleparam.h> +#include <linux/sched.h> +#include <linux/syscore_ops.h> +#include <linux/hrtimer.h> +#include <linux/sched_clock.h> +#include <linux/seqlock.h> +#include <linux/bitops.h> + +struct clock_data { + ktime_t wrap_kt; + u64 epoch_ns; + u64 epoch_cyc; + seqcount_t seq; + unsigned long rate; + u32 mult; + u32 shift; + bool suspended; +}; + +static struct hrtimer sched_clock_timer; +static int irqtime = -1; + +core_param(irqtime, irqtime, int, 0400); + +static struct clock_data cd = { + .mult = NSEC_PER_SEC / HZ, +}; + +static u64 __read_mostly sched_clock_mask; + +static u64 notrace jiffy_sched_clock_read(void) +{ + /* + * We don't need to use get_jiffies_64 on 32-bit arches here + * because we register with BITS_PER_LONG + */ + return (u64)(jiffies - INITIAL_JIFFIES); +} + +static u64 __read_mostly (*read_sched_clock)(void) = jiffy_sched_clock_read; + +static inline u64 notrace cyc_to_ns(u64 cyc, u32 mult, u32 shift) +{ + return (cyc * mult) >> shift; +} + +unsigned long long notrace sched_clock(void) +{ + u64 epoch_ns; + u64 epoch_cyc; + u64 cyc; + unsigned long seq; + + if (cd.suspended) + return cd.epoch_ns; + + do { + seq = raw_read_seqcount_begin(&cd.seq); + epoch_cyc = cd.epoch_cyc; + epoch_ns = cd.epoch_ns; + } while (read_seqcount_retry(&cd.seq, seq)); + + cyc = read_sched_clock(); + cyc = (cyc - epoch_cyc) & sched_clock_mask; + return epoch_ns + cyc_to_ns(cyc, cd.mult, cd.shift); +} + +/* + * Atomically update the sched_clock epoch. + */ +static void notrace update_sched_clock(void) +{ + unsigned long flags; + u64 cyc; + u64 ns; + + cyc = read_sched_clock(); + ns = cd.epoch_ns + + cyc_to_ns((cyc - cd.epoch_cyc) & sched_clock_mask, + cd.mult, cd.shift); + + raw_local_irq_save(flags); + raw_write_seqcount_begin(&cd.seq); + cd.epoch_ns = ns; + cd.epoch_cyc = cyc; + raw_write_seqcount_end(&cd.seq); + raw_local_irq_restore(flags); +} + +static enum hrtimer_restart sched_clock_poll(struct hrtimer *hrt) +{ + update_sched_clock(); + hrtimer_forward_now(hrt, cd.wrap_kt); + return HRTIMER_RESTART; +} + +void __init sched_clock_register(u64 (*read)(void), int bits, + unsigned long rate) +{ + u64 res, wrap, new_mask, new_epoch, cyc, ns; + u32 new_mult, new_shift; + ktime_t new_wrap_kt; + unsigned long r; + char r_unit; + + if (cd.rate > rate) + return; + + WARN_ON(!irqs_disabled()); + + /* calculate the mult/shift to convert counter ticks to ns. */ + clocks_calc_mult_shift(&new_mult, &new_shift, rate, NSEC_PER_SEC, 3600); + + new_mask = CLOCKSOURCE_MASK(bits); + + /* calculate how many ns until we wrap */ + wrap = clocks_calc_max_nsecs(new_mult, new_shift, 0, new_mask); + new_wrap_kt = ns_to_ktime(wrap - (wrap >> 3)); + + /* update epoch for new counter and update epoch_ns from old counter*/ + new_epoch = read(); + cyc = read_sched_clock(); + ns = cd.epoch_ns + cyc_to_ns((cyc - cd.epoch_cyc) & sched_clock_mask, + cd.mult, cd.shift); + + raw_write_seqcount_begin(&cd.seq); + read_sched_clock = read; + sched_clock_mask = new_mask; + cd.rate = rate; + cd.wrap_kt = new_wrap_kt; + cd.mult = new_mult; + cd.shift = new_shift; + cd.epoch_cyc = new_epoch; + cd.epoch_ns = ns; + raw_write_seqcount_end(&cd.seq); + + r = rate; + if (r >= 4000000) { + r /= 1000000; + r_unit = 'M'; + } else if (r >= 1000) { + r /= 1000; + r_unit = 'k'; + } else + r_unit = ' '; + + /* calculate the ns resolution of this counter */ + res = cyc_to_ns(1ULL, new_mult, new_shift); + + pr_info("sched_clock: %u bits at %lu%cHz, resolution %lluns, wraps every %lluns\n", + bits, r, r_unit, res, wrap); + + /* Enable IRQ time accounting if we have a fast enough sched_clock */ + if (irqtime > 0 || (irqtime == -1 && rate >= 1000000)) + enable_sched_clock_irqtime(); + + pr_debug("Registered %pF as sched_clock source\n", read); +} + +void __init sched_clock_postinit(void) +{ + /* + * If no sched_clock function has been provided at that point, + * make it the final one one. + */ + if (read_sched_clock == jiffy_sched_clock_read) + sched_clock_register(jiffy_sched_clock_read, BITS_PER_LONG, HZ); + + update_sched_clock(); + + /* + * Start the timer to keep sched_clock() properly updated and + * sets the initial epoch. + */ + hrtimer_init(&sched_clock_timer, CLOCK_MONOTONIC, HRTIMER_MODE_REL); + sched_clock_timer.function = sched_clock_poll; + hrtimer_start(&sched_clock_timer, cd.wrap_kt, HRTIMER_MODE_REL); +} + +static int sched_clock_suspend(void) +{ + update_sched_clock(); + hrtimer_cancel(&sched_clock_timer); + cd.suspended = true; + return 0; +} + +static void sched_clock_resume(void) +{ + cd.epoch_cyc = read_sched_clock(); + hrtimer_start(&sched_clock_timer, cd.wrap_kt, HRTIMER_MODE_REL); + cd.suspended = false; +} + +static struct syscore_ops sched_clock_ops = { + .suspend = sched_clock_suspend, + .resume = sched_clock_resume, +}; + +static int __init sched_clock_syscore_init(void) +{ + register_syscore_ops(&sched_clock_ops); + return 0; +} +device_initcall(sched_clock_syscore_init); diff --git a/kernel/time/tick-broadcast-hrtimer.c b/kernel/time/tick-broadcast-hrtimer.c new file mode 100644 index 00000000000..eb682d5c697 --- /dev/null +++ b/kernel/time/tick-broadcast-hrtimer.c @@ -0,0 +1,106 @@ +/* + * linux/kernel/time/tick-broadcast-hrtimer.c + * This file emulates a local clock event device + * via a pseudo clock device. + */ +#include <linux/cpu.h> +#include <linux/err.h> +#include <linux/hrtimer.h> +#include <linux/interrupt.h> +#include <linux/percpu.h> +#include <linux/profile.h> +#include <linux/clockchips.h> +#include <linux/sched.h> +#include <linux/smp.h> +#include <linux/module.h> + +#include "tick-internal.h" + +static struct hrtimer bctimer; + +static void bc_set_mode(enum clock_event_mode mode, + struct clock_event_device *bc) +{ + switch (mode) { + case CLOCK_EVT_MODE_SHUTDOWN: + /* + * Note, we cannot cancel the timer here as we might + * run into the following live lock scenario: + * + * cpu 0 cpu1 + * lock(broadcast_lock); + * hrtimer_interrupt() + * bc_handler() + * tick_handle_oneshot_broadcast(); + * lock(broadcast_lock); + * hrtimer_cancel() + * wait_for_callback() + */ + hrtimer_try_to_cancel(&bctimer); + break; + default: + break; + } +} + +/* + * This is called from the guts of the broadcast code when the cpu + * which is about to enter idle has the earliest broadcast timer event. + */ +static int bc_set_next(ktime_t expires, struct clock_event_device *bc) +{ + /* + * We try to cancel the timer first. If the callback is on + * flight on some other cpu then we let it handle it. If we + * were able to cancel the timer nothing can rearm it as we + * own broadcast_lock. + * + * However we can also be called from the event handler of + * ce_broadcast_hrtimer itself when it expires. We cannot + * restart the timer because we are in the callback, but we + * can set the expiry time and let the callback return + * HRTIMER_RESTART. + */ + if (hrtimer_try_to_cancel(&bctimer) >= 0) { + hrtimer_start(&bctimer, expires, HRTIMER_MODE_ABS_PINNED); + /* Bind the "device" to the cpu */ + bc->bound_on = smp_processor_id(); + } else if (bc->bound_on == smp_processor_id()) { + hrtimer_set_expires(&bctimer, expires); + } + return 0; +} + +static struct clock_event_device ce_broadcast_hrtimer = { + .set_mode = bc_set_mode, + .set_next_ktime = bc_set_next, + .features = CLOCK_EVT_FEAT_ONESHOT | + CLOCK_EVT_FEAT_KTIME | + CLOCK_EVT_FEAT_HRTIMER, + .rating = 0, + .bound_on = -1, + .min_delta_ns = 1, + .max_delta_ns = KTIME_MAX, + .min_delta_ticks = 1, + .max_delta_ticks = ULONG_MAX, + .mult = 1, + .shift = 0, + .cpumask = cpu_all_mask, +}; + +static enum hrtimer_restart bc_handler(struct hrtimer *t) +{ + ce_broadcast_hrtimer.event_handler(&ce_broadcast_hrtimer); + + if (ce_broadcast_hrtimer.next_event.tv64 == KTIME_MAX) + return HRTIMER_NORESTART; + + return HRTIMER_RESTART; +} + +void tick_setup_hrtimer_broadcast(void) +{ + hrtimer_init(&bctimer, CLOCK_MONOTONIC, HRTIMER_MODE_ABS); + bctimer.function = bc_handler; + clockevents_register_device(&ce_broadcast_hrtimer); +} diff --git a/kernel/time/tick-broadcast.c b/kernel/time/tick-broadcast.c index b3bafd5fc66..64c5990fd50 100644 --- a/kernel/time/tick-broadcast.c +++ b/kernel/time/tick-broadcast.c @@ -18,7 +18,8 @@ #include <linux/percpu.h> #include <linux/profile.h> #include <linux/sched.h> -#include <linux/tick.h> +#include <linux/smp.h> +#include <linux/module.h> #include "tick-internal.h" @@ -28,9 +29,9 @@ */ static struct tick_device tick_broadcast_device; -/* FIXME: Use cpumask_var_t. */ -static DECLARE_BITMAP(tick_broadcast_mask, NR_CPUS); -static DECLARE_BITMAP(tmpmask, NR_CPUS); +static cpumask_var_t tick_broadcast_mask; +static cpumask_var_t tick_broadcast_on; +static cpumask_var_t tmpmask; static DEFINE_RAW_SPINLOCK(tick_broadcast_lock); static int tick_broadcast_force; @@ -50,7 +51,7 @@ struct tick_device *tick_get_broadcast_device(void) struct cpumask *tick_get_broadcast_mask(void) { - return to_cpumask(tick_broadcast_mask); + return tick_broadcast_mask; } /* @@ -65,18 +66,50 @@ static void tick_broadcast_start_periodic(struct clock_event_device *bc) /* * Check, if the device can be utilized as broadcast device: */ -int tick_check_broadcast_device(struct clock_event_device *dev) +static bool tick_check_broadcast_device(struct clock_event_device *curdev, + struct clock_event_device *newdev) { - if ((tick_broadcast_device.evtdev && - tick_broadcast_device.evtdev->rating >= dev->rating) || - (dev->features & CLOCK_EVT_FEAT_C3STOP)) - return 0; + if ((newdev->features & CLOCK_EVT_FEAT_DUMMY) || + (newdev->features & CLOCK_EVT_FEAT_PERCPU) || + (newdev->features & CLOCK_EVT_FEAT_C3STOP)) + return false; + + if (tick_broadcast_device.mode == TICKDEV_MODE_ONESHOT && + !(newdev->features & CLOCK_EVT_FEAT_ONESHOT)) + return false; + + return !curdev || newdev->rating > curdev->rating; +} + +/* + * Conditionally install/replace broadcast device + */ +void tick_install_broadcast_device(struct clock_event_device *dev) +{ + struct clock_event_device *cur = tick_broadcast_device.evtdev; + + if (!tick_check_broadcast_device(cur, dev)) + return; - clockevents_exchange_device(NULL, dev); + if (!try_module_get(dev->owner)) + return; + + clockevents_exchange_device(cur, dev); + if (cur) + cur->event_handler = clockevents_handle_noop; tick_broadcast_device.evtdev = dev; - if (!cpumask_empty(tick_get_broadcast_mask())) + if (!cpumask_empty(tick_broadcast_mask)) tick_broadcast_start_periodic(dev); - return 1; + /* + * Inform all cpus about this. We might be in a situation + * where we did not switch to oneshot mode because the per cpu + * devices are affected by CLOCK_EVT_FEAT_C3STOP and the lack + * of a oneshot capable broadcast device. Without that + * notification the systems stays stuck in periodic mode + * forever. + */ + if (dev->features & CLOCK_EVT_FEAT_ONESHOT) + tick_clock_notify(); } /* @@ -87,14 +120,44 @@ int tick_is_broadcast_device(struct clock_event_device *dev) return (dev && tick_broadcast_device.evtdev == dev); } +int tick_broadcast_update_freq(struct clock_event_device *dev, u32 freq) +{ + int ret = -ENODEV; + + if (tick_is_broadcast_device(dev)) { + raw_spin_lock(&tick_broadcast_lock); + ret = __clockevents_update_freq(dev, freq); + raw_spin_unlock(&tick_broadcast_lock); + } + return ret; +} + + +static void err_broadcast(const struct cpumask *mask) +{ + pr_crit_once("Failed to broadcast timer tick. Some CPUs may be unresponsive.\n"); +} + +static void tick_device_setup_broadcast_func(struct clock_event_device *dev) +{ + if (!dev->broadcast) + dev->broadcast = tick_broadcast; + if (!dev->broadcast) { + pr_warn_once("%s depends on broadcast, but no broadcast function available\n", + dev->name); + dev->broadcast = err_broadcast; + } +} + /* * Check, if the device is disfunctional and a place holder, which * needs to be handled by the broadcast device. */ int tick_device_uses_broadcast(struct clock_event_device *dev, int cpu) { + struct clock_event_device *bc = tick_broadcast_device.evtdev; unsigned long flags; - int ret = 0; + int ret; raw_spin_lock_irqsave(&tick_broadcast_lock, flags); @@ -106,26 +169,87 @@ int tick_device_uses_broadcast(struct clock_event_device *dev, int cpu) */ if (!tick_device_is_functional(dev)) { dev->event_handler = tick_handle_periodic; - cpumask_set_cpu(cpu, tick_get_broadcast_mask()); - tick_broadcast_start_periodic(tick_broadcast_device.evtdev); + tick_device_setup_broadcast_func(dev); + cpumask_set_cpu(cpu, tick_broadcast_mask); + if (tick_broadcast_device.mode == TICKDEV_MODE_PERIODIC) + tick_broadcast_start_periodic(bc); + else + tick_broadcast_setup_oneshot(bc); ret = 1; } else { /* - * When the new device is not affected by the stop - * feature and the cpu is marked in the broadcast mask - * then clear the broadcast bit. + * Clear the broadcast bit for this cpu if the + * device is not power state affected. */ - if (!(dev->features & CLOCK_EVT_FEAT_C3STOP)) { - int cpu = smp_processor_id(); + if (!(dev->features & CLOCK_EVT_FEAT_C3STOP)) + cpumask_clear_cpu(cpu, tick_broadcast_mask); + else + tick_device_setup_broadcast_func(dev); - cpumask_clear_cpu(cpu, tick_get_broadcast_mask()); + /* + * Clear the broadcast bit if the CPU is not in + * periodic broadcast on state. + */ + if (!cpumask_test_cpu(cpu, tick_broadcast_on)) + cpumask_clear_cpu(cpu, tick_broadcast_mask); + + switch (tick_broadcast_device.mode) { + case TICKDEV_MODE_ONESHOT: + /* + * If the system is in oneshot mode we can + * unconditionally clear the oneshot mask bit, + * because the CPU is running and therefore + * not in an idle state which causes the power + * state affected device to stop. Let the + * caller initialize the device. + */ tick_broadcast_clear_oneshot(cpu); + ret = 0; + break; + + case TICKDEV_MODE_PERIODIC: + /* + * If the system is in periodic mode, check + * whether the broadcast device can be + * switched off now. + */ + if (cpumask_empty(tick_broadcast_mask) && bc) + clockevents_shutdown(bc); + /* + * If we kept the cpu in the broadcast mask, + * tell the caller to leave the per cpu device + * in shutdown state. The periodic interrupt + * is delivered by the broadcast device. + */ + ret = cpumask_test_cpu(cpu, tick_broadcast_mask); + break; + default: + /* Nothing to do */ + ret = 0; + break; } } raw_spin_unlock_irqrestore(&tick_broadcast_lock, flags); return ret; } +#ifdef CONFIG_GENERIC_CLOCKEVENTS_BROADCAST +int tick_receive_broadcast(void) +{ + struct tick_device *td = this_cpu_ptr(&tick_cpu_device); + struct clock_event_device *evt = td->evtdev; + + if (!evt) + return -ENODEV; + + if (!evt->event_handler) + return -EINVAL; + + evt->event_handler(evt); + return 0; +} +#endif + /* * Broadcast the event to the cpus, which are set in the mask (mangled). */ @@ -161,13 +285,8 @@ static void tick_do_broadcast(struct cpumask *mask) */ static void tick_do_periodic_broadcast(void) { - raw_spin_lock(&tick_broadcast_lock); - - cpumask_and(to_cpumask(tmpmask), - cpu_online_mask, tick_get_broadcast_mask()); - tick_do_broadcast(to_cpumask(tmpmask)); - - raw_spin_unlock(&tick_broadcast_lock); + cpumask_and(tmpmask, cpu_online_mask, tick_broadcast_mask); + tick_do_broadcast(tmpmask); } /* @@ -177,28 +296,32 @@ static void tick_handle_periodic_broadcast(struct clock_event_device *dev) { ktime_t next; + raw_spin_lock(&tick_broadcast_lock); + tick_do_periodic_broadcast(); /* * The device is in periodic mode. No reprogramming necessary: */ if (dev->mode == CLOCK_EVT_MODE_PERIODIC) - return; + goto unlock; /* * Setup the next period for devices, which do not have * periodic mode. We read dev->next_event first and add to it - * when the event alrady expired. clockevents_program_event() + * when the event already expired. clockevents_program_event() * sets dev->next_event only when the event is really * programmed to the device. */ for (next = dev->next_event; ;) { next = ktime_add(next, tick_period); - if (!clockevents_program_event(dev, next, ktime_get())) - return; + if (!clockevents_program_event(dev, next, false)) + goto unlock; tick_do_periodic_broadcast(); } +unlock: + raw_spin_unlock(&tick_broadcast_lock); } /* @@ -228,13 +351,13 @@ static void tick_do_broadcast_on_off(unsigned long *reason) if (!tick_device_is_functional(dev)) goto out; - bc_stopped = cpumask_empty(tick_get_broadcast_mask()); + bc_stopped = cpumask_empty(tick_broadcast_mask); switch (*reason) { case CLOCK_EVT_NOTIFY_BROADCAST_ON: case CLOCK_EVT_NOTIFY_BROADCAST_FORCE: - if (!cpumask_test_cpu(cpu, tick_get_broadcast_mask())) { - cpumask_set_cpu(cpu, tick_get_broadcast_mask()); + cpumask_set_cpu(cpu, tick_broadcast_on); + if (!cpumask_test_and_set_cpu(cpu, tick_broadcast_mask)) { if (tick_broadcast_device.mode == TICKDEV_MODE_PERIODIC) clockevents_shutdown(dev); @@ -243,9 +366,12 @@ static void tick_do_broadcast_on_off(unsigned long *reason) tick_broadcast_force = 1; break; case CLOCK_EVT_NOTIFY_BROADCAST_OFF: - if (!tick_broadcast_force && - cpumask_test_cpu(cpu, tick_get_broadcast_mask())) { - cpumask_clear_cpu(cpu, tick_get_broadcast_mask()); + if (tick_broadcast_force) + break; + cpumask_clear_cpu(cpu, tick_broadcast_on); + if (!tick_device_is_functional(dev)) + break; + if (cpumask_test_and_clear_cpu(cpu, tick_broadcast_mask)) { if (tick_broadcast_device.mode == TICKDEV_MODE_PERIODIC) tick_setup_periodic(dev, 0); @@ -253,7 +379,7 @@ static void tick_do_broadcast_on_off(unsigned long *reason) break; } - if (cpumask_empty(tick_get_broadcast_mask())) { + if (cpumask_empty(tick_broadcast_mask)) { if (!bc_stopped) clockevents_shutdown(bc); } else if (bc_stopped) { @@ -302,10 +428,11 @@ void tick_shutdown_broadcast(unsigned int *cpup) raw_spin_lock_irqsave(&tick_broadcast_lock, flags); bc = tick_broadcast_device.evtdev; - cpumask_clear_cpu(cpu, tick_get_broadcast_mask()); + cpumask_clear_cpu(cpu, tick_broadcast_mask); + cpumask_clear_cpu(cpu, tick_broadcast_on); if (tick_broadcast_device.mode == TICKDEV_MODE_PERIODIC) { - if (bc && cpumask_empty(tick_get_broadcast_mask())) + if (bc && cpumask_empty(tick_broadcast_mask)) clockevents_shutdown(bc); } @@ -341,13 +468,14 @@ int tick_resume_broadcast(void) switch (tick_broadcast_device.mode) { case TICKDEV_MODE_PERIODIC: - if (!cpumask_empty(tick_get_broadcast_mask())) + if (!cpumask_empty(tick_broadcast_mask)) tick_broadcast_start_periodic(bc); broadcast = cpumask_test_cpu(smp_processor_id(), - tick_get_broadcast_mask()); + tick_broadcast_mask); break; case TICKDEV_MODE_ONESHOT: - broadcast = tick_resume_broadcast_oneshot(bc); + if (!cpumask_empty(tick_broadcast_mask)) + broadcast = tick_resume_broadcast_oneshot(bc); break; } } @@ -359,22 +487,58 @@ int tick_resume_broadcast(void) #ifdef CONFIG_TICK_ONESHOT -/* FIXME: use cpumask_var_t. */ -static DECLARE_BITMAP(tick_broadcast_oneshot_mask, NR_CPUS); +static cpumask_var_t tick_broadcast_oneshot_mask; +static cpumask_var_t tick_broadcast_pending_mask; +static cpumask_var_t tick_broadcast_force_mask; /* * Exposed for debugging: see timer_list.c */ struct cpumask *tick_get_broadcast_oneshot_mask(void) { - return to_cpumask(tick_broadcast_oneshot_mask); + return tick_broadcast_oneshot_mask; } -static int tick_broadcast_set_event(ktime_t expires, int force) +/* + * Called before going idle with interrupts disabled. Checks whether a + * broadcast event from the other core is about to happen. We detected + * that in tick_broadcast_oneshot_control(). The callsite can use this + * to avoid a deep idle transition as we are about to get the + * broadcast IPI right away. + */ +int tick_check_broadcast_expired(void) { - struct clock_event_device *bc = tick_broadcast_device.evtdev; + return cpumask_test_cpu(smp_processor_id(), tick_broadcast_force_mask); +} + +/* + * Set broadcast interrupt affinity + */ +static void tick_broadcast_set_affinity(struct clock_event_device *bc, + const struct cpumask *cpumask) +{ + if (!(bc->features & CLOCK_EVT_FEAT_DYNIRQ)) + return; + + if (cpumask_equal(bc->cpumask, cpumask)) + return; + + bc->cpumask = cpumask; + irq_set_affinity(bc->irq, bc->cpumask); +} + +static int tick_broadcast_set_event(struct clock_event_device *bc, int cpu, + ktime_t expires, int force) +{ + int ret; + + if (bc->mode != CLOCK_EVT_MODE_ONESHOT) + clockevents_set_mode(bc, CLOCK_EVT_MODE_ONESHOT); - return tick_dev_program_event(bc, expires, force); + ret = clockevents_program_event(bc, expires, force); + if (!ret) + tick_broadcast_set_affinity(bc, cpumask_of(cpu)); + return ret; } int tick_resume_broadcast_oneshot(struct clock_event_device *bc) @@ -387,12 +551,20 @@ int tick_resume_broadcast_oneshot(struct clock_event_device *bc) * Called from irq_enter() when idle was interrupted to reenable the * per cpu device. */ -void tick_check_oneshot_broadcast(int cpu) +void tick_check_oneshot_broadcast_this_cpu(void) { - if (cpumask_test_cpu(cpu, to_cpumask(tick_broadcast_oneshot_mask))) { - struct tick_device *td = &per_cpu(tick_cpu_device, cpu); + if (cpumask_test_cpu(smp_processor_id(), tick_broadcast_oneshot_mask)) { + struct tick_device *td = &__get_cpu_var(tick_cpu_device); - clockevents_set_mode(td->evtdev, CLOCK_EVT_MODE_ONESHOT); + /* + * We might be in the middle of switching over from + * periodic to oneshot. If the CPU has not yet + * switched over, leave the device alone. + */ + if (td->mode == TICKDEV_MODE_ONESHOT) { + clockevents_set_mode(td->evtdev, + CLOCK_EVT_MODE_ONESHOT); + } } } @@ -403,27 +575,52 @@ static void tick_handle_oneshot_broadcast(struct clock_event_device *dev) { struct tick_device *td; ktime_t now, next_event; - int cpu; + int cpu, next_cpu = 0; raw_spin_lock(&tick_broadcast_lock); again: dev->next_event.tv64 = KTIME_MAX; next_event.tv64 = KTIME_MAX; - cpumask_clear(to_cpumask(tmpmask)); + cpumask_clear(tmpmask); now = ktime_get(); /* Find all expired events */ - for_each_cpu(cpu, tick_get_broadcast_oneshot_mask()) { + for_each_cpu(cpu, tick_broadcast_oneshot_mask) { td = &per_cpu(tick_cpu_device, cpu); - if (td->evtdev->next_event.tv64 <= now.tv64) - cpumask_set_cpu(cpu, to_cpumask(tmpmask)); - else if (td->evtdev->next_event.tv64 < next_event.tv64) + if (td->evtdev->next_event.tv64 <= now.tv64) { + cpumask_set_cpu(cpu, tmpmask); + /* + * Mark the remote cpu in the pending mask, so + * it can avoid reprogramming the cpu local + * timer in tick_broadcast_oneshot_control(). + */ + cpumask_set_cpu(cpu, tick_broadcast_pending_mask); + } else if (td->evtdev->next_event.tv64 < next_event.tv64) { next_event.tv64 = td->evtdev->next_event.tv64; + next_cpu = cpu; + } } /* + * Remove the current cpu from the pending mask. The event is + * delivered immediately in tick_do_broadcast() ! + */ + cpumask_clear_cpu(smp_processor_id(), tick_broadcast_pending_mask); + + /* Take care of enforced broadcast requests */ + cpumask_or(tmpmask, tmpmask, tick_broadcast_force_mask); + cpumask_clear(tick_broadcast_force_mask); + + /* + * Sanity check. Catch the case where we try to broadcast to + * offline cpus. + */ + if (WARN_ON_ONCE(!cpumask_subset(tmpmask, cpu_online_mask))) + cpumask_and(tmpmask, tmpmask, cpu_online_mask); + + /* * Wakeup the cpus which have an expired event. */ - tick_do_broadcast(to_cpumask(tmpmask)); + tick_do_broadcast(tmpmask); /* * Two reasons for reprogram: @@ -440,59 +637,176 @@ again: * Rearm the broadcast device. If event expired, * repeat the above */ - if (tick_broadcast_set_event(next_event, 0)) + if (tick_broadcast_set_event(dev, next_cpu, next_event, 0)) goto again; } raw_spin_unlock(&tick_broadcast_lock); } +static int broadcast_needs_cpu(struct clock_event_device *bc, int cpu) +{ + if (!(bc->features & CLOCK_EVT_FEAT_HRTIMER)) + return 0; + if (bc->next_event.tv64 == KTIME_MAX) + return 0; + return bc->bound_on == cpu ? -EBUSY : 0; +} + +static void broadcast_shutdown_local(struct clock_event_device *bc, + struct clock_event_device *dev) +{ + /* + * For hrtimer based broadcasting we cannot shutdown the cpu + * local device if our own event is the first one to expire or + * if we own the broadcast timer. + */ + if (bc->features & CLOCK_EVT_FEAT_HRTIMER) { + if (broadcast_needs_cpu(bc, smp_processor_id())) + return; + if (dev->next_event.tv64 < bc->next_event.tv64) + return; + } + clockevents_set_mode(dev, CLOCK_EVT_MODE_SHUTDOWN); +} + +static void broadcast_move_bc(int deadcpu) +{ + struct clock_event_device *bc = tick_broadcast_device.evtdev; + + if (!bc || !broadcast_needs_cpu(bc, deadcpu)) + return; + /* This moves the broadcast assignment to this cpu */ + clockevents_program_event(bc, bc->next_event, 1); +} + /* * Powerstate information: The system enters/leaves a state, where * affected devices might stop + * Returns 0 on success, -EBUSY if the cpu is used to broadcast wakeups. */ -void tick_broadcast_oneshot_control(unsigned long reason) +int tick_broadcast_oneshot_control(unsigned long reason) { struct clock_event_device *bc, *dev; struct tick_device *td; unsigned long flags; - int cpu; - - raw_spin_lock_irqsave(&tick_broadcast_lock, flags); + ktime_t now; + int cpu, ret = 0; /* * Periodic mode does not care about the enter/exit of power * states */ if (tick_broadcast_device.mode == TICKDEV_MODE_PERIODIC) - goto out; + return 0; - bc = tick_broadcast_device.evtdev; + /* + * We are called with preemtion disabled from the depth of the + * idle code, so we can't be moved away. + */ cpu = smp_processor_id(); td = &per_cpu(tick_cpu_device, cpu); dev = td->evtdev; if (!(dev->features & CLOCK_EVT_FEAT_C3STOP)) - goto out; + return 0; + bc = tick_broadcast_device.evtdev; + + raw_spin_lock_irqsave(&tick_broadcast_lock, flags); if (reason == CLOCK_EVT_NOTIFY_BROADCAST_ENTER) { - if (!cpumask_test_cpu(cpu, tick_get_broadcast_oneshot_mask())) { - cpumask_set_cpu(cpu, tick_get_broadcast_oneshot_mask()); - clockevents_set_mode(dev, CLOCK_EVT_MODE_SHUTDOWN); - if (dev->next_event.tv64 < bc->next_event.tv64) - tick_broadcast_set_event(dev->next_event, 1); + if (!cpumask_test_and_set_cpu(cpu, tick_broadcast_oneshot_mask)) { + WARN_ON_ONCE(cpumask_test_cpu(cpu, tick_broadcast_pending_mask)); + broadcast_shutdown_local(bc, dev); + /* + * We only reprogram the broadcast timer if we + * did not mark ourself in the force mask and + * if the cpu local event is earlier than the + * broadcast event. If the current CPU is in + * the force mask, then we are going to be + * woken by the IPI right away. + */ + if (!cpumask_test_cpu(cpu, tick_broadcast_force_mask) && + dev->next_event.tv64 < bc->next_event.tv64) + tick_broadcast_set_event(bc, cpu, dev->next_event, 1); } + /* + * If the current CPU owns the hrtimer broadcast + * mechanism, it cannot go deep idle and we remove the + * CPU from the broadcast mask. We don't have to go + * through the EXIT path as the local timer is not + * shutdown. + */ + ret = broadcast_needs_cpu(bc, cpu); + if (ret) + cpumask_clear_cpu(cpu, tick_broadcast_oneshot_mask); } else { - if (cpumask_test_cpu(cpu, tick_get_broadcast_oneshot_mask())) { - cpumask_clear_cpu(cpu, - tick_get_broadcast_oneshot_mask()); + if (cpumask_test_and_clear_cpu(cpu, tick_broadcast_oneshot_mask)) { clockevents_set_mode(dev, CLOCK_EVT_MODE_ONESHOT); - if (dev->next_event.tv64 != KTIME_MAX) - tick_program_event(dev->next_event, 1); + /* + * The cpu which was handling the broadcast + * timer marked this cpu in the broadcast + * pending mask and fired the broadcast + * IPI. So we are going to handle the expired + * event anyway via the broadcast IPI + * handler. No need to reprogram the timer + * with an already expired event. + */ + if (cpumask_test_and_clear_cpu(cpu, + tick_broadcast_pending_mask)) + goto out; + + /* + * Bail out if there is no next event. + */ + if (dev->next_event.tv64 == KTIME_MAX) + goto out; + /* + * If the pending bit is not set, then we are + * either the CPU handling the broadcast + * interrupt or we got woken by something else. + * + * We are not longer in the broadcast mask, so + * if the cpu local expiry time is already + * reached, we would reprogram the cpu local + * timer with an already expired event. + * + * This can lead to a ping-pong when we return + * to idle and therefor rearm the broadcast + * timer before the cpu local timer was able + * to fire. This happens because the forced + * reprogramming makes sure that the event + * will happen in the future and depending on + * the min_delta setting this might be far + * enough out that the ping-pong starts. + * + * If the cpu local next_event has expired + * then we know that the broadcast timer + * next_event has expired as well and + * broadcast is about to be handled. So we + * avoid reprogramming and enforce that the + * broadcast handler, which did not run yet, + * will invoke the cpu local handler. + * + * We cannot call the handler directly from + * here, because we might be in a NOHZ phase + * and we did not go through the irq_enter() + * nohz fixups. + */ + now = ktime_get(); + if (dev->next_event.tv64 <= now.tv64) { + cpumask_set_cpu(cpu, tick_broadcast_force_mask); + goto out; + } + /* + * We got woken by something else. Reprogram + * the cpu local timer device. + */ + tick_program_event(dev->next_event, 1); } } - out: raw_spin_unlock_irqrestore(&tick_broadcast_lock, flags); + return ret; } /* @@ -502,7 +816,8 @@ out: */ static void tick_broadcast_clear_oneshot(int cpu) { - cpumask_clear_cpu(cpu, tick_get_broadcast_oneshot_mask()); + cpumask_clear_cpu(cpu, tick_broadcast_oneshot_mask); + cpumask_clear_cpu(cpu, tick_broadcast_pending_mask); } static void tick_broadcast_init_next_event(struct cpumask *mask, @@ -523,16 +838,13 @@ static void tick_broadcast_init_next_event(struct cpumask *mask, */ void tick_broadcast_setup_oneshot(struct clock_event_device *bc) { + int cpu = smp_processor_id(); + /* Set it up only once ! */ if (bc->event_handler != tick_handle_oneshot_broadcast) { int was_periodic = bc->mode == CLOCK_EVT_MODE_PERIODIC; - int cpu = smp_processor_id(); bc->event_handler = tick_handle_oneshot_broadcast; - clockevents_set_mode(bc, CLOCK_EVT_MODE_ONESHOT); - - /* Take the do_timer update */ - tick_do_timer_cpu = cpu; /* * We must be careful here. There might be other CPUs @@ -540,18 +852,27 @@ void tick_broadcast_setup_oneshot(struct clock_event_device *bc) * oneshot_mask bits for those and program the * broadcast device to fire. */ - cpumask_copy(to_cpumask(tmpmask), tick_get_broadcast_mask()); - cpumask_clear_cpu(cpu, to_cpumask(tmpmask)); - cpumask_or(tick_get_broadcast_oneshot_mask(), - tick_get_broadcast_oneshot_mask(), - to_cpumask(tmpmask)); - - if (was_periodic && !cpumask_empty(to_cpumask(tmpmask))) { - tick_broadcast_init_next_event(to_cpumask(tmpmask), + cpumask_copy(tmpmask, tick_broadcast_mask); + cpumask_clear_cpu(cpu, tmpmask); + cpumask_or(tick_broadcast_oneshot_mask, + tick_broadcast_oneshot_mask, tmpmask); + + if (was_periodic && !cpumask_empty(tmpmask)) { + clockevents_set_mode(bc, CLOCK_EVT_MODE_ONESHOT); + tick_broadcast_init_next_event(tmpmask, tick_next_period); - tick_broadcast_set_event(tick_next_period, 1); + tick_broadcast_set_event(bc, cpu, tick_next_period, 1); } else bc->next_event.tv64 = KTIME_MAX; + } else { + /* + * The first cpu which switches to oneshot mode sets + * the bit for all other cpus which are in the general + * (periodic) broadcast mask. So the bit is set and + * would prevent the first broadcast enter after this + * to program the bc device. + */ + tick_broadcast_clear_oneshot(cpu); } } @@ -569,6 +890,7 @@ void tick_broadcast_switch_to_oneshot(void) bc = tick_broadcast_device.evtdev; if (bc) tick_broadcast_setup_oneshot(bc); + raw_spin_unlock_irqrestore(&tick_broadcast_lock, flags); } @@ -584,10 +906,14 @@ void tick_shutdown_broadcast_oneshot(unsigned int *cpup) raw_spin_lock_irqsave(&tick_broadcast_lock, flags); /* - * Clear the broadcast mask flag for the dead cpu, but do not - * stop the broadcast device! + * Clear the broadcast masks for the dead cpu, but do not stop + * the broadcast device! */ - cpumask_clear_cpu(cpu, tick_get_broadcast_oneshot_mask()); + cpumask_clear_cpu(cpu, tick_broadcast_oneshot_mask); + cpumask_clear_cpu(cpu, tick_broadcast_pending_mask); + cpumask_clear_cpu(cpu, tick_broadcast_force_mask); + + broadcast_move_bc(cpu); raw_spin_unlock_irqrestore(&tick_broadcast_lock, flags); } @@ -600,4 +926,26 @@ int tick_broadcast_oneshot_active(void) return tick_broadcast_device.mode == TICKDEV_MODE_ONESHOT; } +/* + * Check whether the broadcast device supports oneshot. + */ +bool tick_broadcast_oneshot_available(void) +{ + struct clock_event_device *bc = tick_broadcast_device.evtdev; + + return bc ? bc->features & CLOCK_EVT_FEAT_ONESHOT : false; +} + #endif + +void __init tick_broadcast_init(void) +{ + zalloc_cpumask_var(&tick_broadcast_mask, GFP_NOWAIT); + zalloc_cpumask_var(&tick_broadcast_on, GFP_NOWAIT); + zalloc_cpumask_var(&tmpmask, GFP_NOWAIT); +#ifdef CONFIG_TICK_ONESHOT + zalloc_cpumask_var(&tick_broadcast_oneshot_mask, GFP_NOWAIT); + zalloc_cpumask_var(&tick_broadcast_pending_mask, GFP_NOWAIT); + zalloc_cpumask_var(&tick_broadcast_force_mask, GFP_NOWAIT); +#endif +} diff --git a/kernel/time/tick-common.c b/kernel/time/tick-common.c index b6b898d2eee..0a0608edeb2 100644 --- a/kernel/time/tick-common.c +++ b/kernel/time/tick-common.c @@ -18,7 +18,7 @@ #include <linux/percpu.h> #include <linux/profile.h> #include <linux/sched.h> -#include <linux/tick.h> +#include <linux/module.h> #include <asm/irq_regs.h> @@ -33,8 +33,22 @@ DEFINE_PER_CPU(struct tick_device, tick_cpu_device); */ ktime_t tick_next_period; ktime_t tick_period; + +/* + * tick_do_timer_cpu is a timer core internal variable which holds the CPU NR + * which is responsible for calling do_timer(), i.e. the timekeeping stuff. This + * variable has two functions: + * + * 1) Prevent a thundering herd issue of a gazillion of CPUs trying to grab the + * timekeeping lock all at once. Only the CPU which is assigned to do the + * update is handling it. + * + * 2) Hand off the duty in the NOHZ idle case by setting the value to + * TICK_DO_TIMER_NONE, i.e. a non existing CPU. So the next cpu which looks + * at it will take over and keep the time keeping alive. The handover + * procedure also covers cpu hotplug. + */ int tick_do_timer_cpu __read_mostly = TICK_DO_TIMER_BOOT; -static DEFINE_RAW_SPINLOCK(tick_device_lock); /* * Debugging: see timer_list.c @@ -49,9 +63,13 @@ struct tick_device *tick_get_device(int cpu) */ int tick_is_oneshot_available(void) { - struct clock_event_device *dev = __get_cpu_var(tick_cpu_device).evtdev; + struct clock_event_device *dev = __this_cpu_read(tick_cpu_device.evtdev); - return dev && (dev->features & CLOCK_EVT_FEAT_ONESHOT); + if (!dev || !(dev->features & CLOCK_EVT_FEAT_ONESHOT)) + return 0; + if (!(dev->features & CLOCK_EVT_FEAT_C3STOP)) + return 1; + return tick_broadcast_oneshot_available(); } /* @@ -60,13 +78,14 @@ int tick_is_oneshot_available(void) static void tick_periodic(int cpu) { if (tick_do_timer_cpu == cpu) { - write_seqlock(&xtime_lock); + write_seqlock(&jiffies_lock); /* Keep track of the next tick event */ tick_next_period = ktime_add(tick_next_period, tick_period); do_timer(1); - write_sequnlock(&xtime_lock); + write_sequnlock(&jiffies_lock); + update_wall_time(); } update_process_times(user_mode(get_irq_regs())); @@ -79,19 +98,20 @@ static void tick_periodic(int cpu) void tick_handle_periodic(struct clock_event_device *dev) { int cpu = smp_processor_id(); - ktime_t next; + ktime_t next = dev->next_event; tick_periodic(cpu); if (dev->mode != CLOCK_EVT_MODE_ONESHOT) return; - /* - * Setup the next period for devices, which do not have - * periodic mode: - */ - next = ktime_add(dev->next_event, tick_period); for (;;) { - if (!clockevents_program_event(dev, next, ktime_get())) + /* + * Setup the next period for devices, which do not have + * periodic mode: + */ + next = ktime_add(next, tick_period); + + if (!clockevents_program_event(dev, next, false)) return; /* * Have to be careful here. If we're in oneshot mode, @@ -99,12 +119,11 @@ void tick_handle_periodic(struct clock_event_device *dev) * to be sure we're using a real hardware clocksource. * Otherwise we could get trapped in an infinite * loop, as the tick_periodic() increments jiffies, - * when then will increment time, posibly causing + * which then will increment time, possibly causing * the loop to trigger again and again. */ if (timekeeping_valid_for_hres()) tick_periodic(cpu); - next = ktime_add(next, tick_period); } } @@ -127,14 +146,14 @@ void tick_setup_periodic(struct clock_event_device *dev, int broadcast) ktime_t next; do { - seq = read_seqbegin(&xtime_lock); + seq = read_seqbegin(&jiffies_lock); next = tick_next_period; - } while (read_seqretry(&xtime_lock, seq)); + } while (read_seqretry(&jiffies_lock, seq)); clockevents_set_mode(dev, CLOCK_EVT_MODE_ONESHOT); for (;;) { - if (!clockevents_program_event(dev, next, ktime_get())) + if (!clockevents_program_event(dev, next, false)) return; next = ktime_add(next, tick_period); } @@ -160,7 +179,10 @@ static void tick_setup_device(struct tick_device *td, * this cpu: */ if (tick_do_timer_cpu == TICK_DO_TIMER_BOOT) { - tick_do_timer_cpu = cpu; + if (!tick_nohz_full_cpu(cpu)) + tick_do_timer_cpu = cpu; + else + tick_do_timer_cpu = TICK_DO_TIMER_NONE; tick_next_period = ktime_get(); tick_period = ktime_set(0, NSEC_PER_SEC / HZ); } @@ -188,7 +210,8 @@ static void tick_setup_device(struct tick_device *td, * When global broadcasting is active, check if the current * device is registered as a placeholder for broadcast mode. * This allows us to handle this x86 misfeature in a generic - * way. + * way. This function also returns !=0 when we keep the + * current active broadcast state for this CPU. */ if (tick_device_uses_broadcast(newdev, cpu)) return; @@ -199,17 +222,75 @@ static void tick_setup_device(struct tick_device *td, tick_setup_oneshot(newdev, handler, next_event); } +void tick_install_replacement(struct clock_event_device *newdev) +{ + struct tick_device *td = &__get_cpu_var(tick_cpu_device); + int cpu = smp_processor_id(); + + clockevents_exchange_device(td->evtdev, newdev); + tick_setup_device(td, newdev, cpu, cpumask_of(cpu)); + if (newdev->features & CLOCK_EVT_FEAT_ONESHOT) + tick_oneshot_notify(); +} + +static bool tick_check_percpu(struct clock_event_device *curdev, + struct clock_event_device *newdev, int cpu) +{ + if (!cpumask_test_cpu(cpu, newdev->cpumask)) + return false; + if (cpumask_equal(newdev->cpumask, cpumask_of(cpu))) + return true; + /* Check if irq affinity can be set */ + if (newdev->irq >= 0 && !irq_can_set_affinity(newdev->irq)) + return false; + /* Prefer an existing cpu local device */ + if (curdev && cpumask_equal(curdev->cpumask, cpumask_of(cpu))) + return false; + return true; +} + +static bool tick_check_preferred(struct clock_event_device *curdev, + struct clock_event_device *newdev) +{ + /* Prefer oneshot capable device */ + if (!(newdev->features & CLOCK_EVT_FEAT_ONESHOT)) { + if (curdev && (curdev->features & CLOCK_EVT_FEAT_ONESHOT)) + return false; + if (tick_oneshot_mode_active()) + return false; + } + + /* + * Use the higher rated one, but prefer a CPU local device with a lower + * rating than a non-CPU local device + */ + return !curdev || + newdev->rating > curdev->rating || + !cpumask_equal(curdev->cpumask, newdev->cpumask); +} + /* - * Check, if the new registered device should be used. + * Check whether the new device is a better fit than curdev. curdev + * can be NULL ! */ -static int tick_check_new_device(struct clock_event_device *newdev) +bool tick_check_replacement(struct clock_event_device *curdev, + struct clock_event_device *newdev) +{ + if (!tick_check_percpu(curdev, newdev, smp_processor_id())) + return false; + + return tick_check_preferred(curdev, newdev); +} + +/* + * Check, if the new registered device should be used. Called with + * clockevents_lock held and interrupts disabled. + */ +void tick_check_new_device(struct clock_event_device *newdev) { struct clock_event_device *curdev; struct tick_device *td; - int cpu, ret = NOTIFY_OK; - unsigned long flags; - - raw_spin_lock_irqsave(&tick_device_lock, flags); + int cpu; cpu = smp_processor_id(); if (!cpumask_test_cpu(cpu, newdev->cpumask)) @@ -219,40 +300,15 @@ static int tick_check_new_device(struct clock_event_device *newdev) curdev = td->evtdev; /* cpu local device ? */ - if (!cpumask_equal(newdev->cpumask, cpumask_of(cpu))) { - - /* - * If the cpu affinity of the device interrupt can not - * be set, ignore it. - */ - if (!irq_can_set_affinity(newdev->irq)) - goto out_bc; + if (!tick_check_percpu(curdev, newdev, cpu)) + goto out_bc; - /* - * If we have a cpu local device already, do not replace it - * by a non cpu local device - */ - if (curdev && cpumask_equal(curdev->cpumask, cpumask_of(cpu))) - goto out_bc; - } + /* Preference decision */ + if (!tick_check_preferred(curdev, newdev)) + goto out_bc; - /* - * If we have an active device, then check the rating and the oneshot - * feature. - */ - if (curdev) { - /* - * Prefer one shot capable devices ! - */ - if ((curdev->features & CLOCK_EVT_FEAT_ONESHOT) && - !(newdev->features & CLOCK_EVT_FEAT_ONESHOT)) - goto out_bc; - /* - * Check the rating - */ - if (curdev->rating >= newdev->rating) - goto out_bc; - } + if (!try_module_get(newdev->owner)) + return; /* * Replace the eventually existing device by the new @@ -267,20 +323,13 @@ static int tick_check_new_device(struct clock_event_device *newdev) tick_setup_device(td, newdev, cpu, cpumask_of(cpu)); if (newdev->features & CLOCK_EVT_FEAT_ONESHOT) tick_oneshot_notify(); - - raw_spin_unlock_irqrestore(&tick_device_lock, flags); - return NOTIFY_STOP; + return; out_bc: /* * Can the new device be used as a broadcast device ? */ - if (tick_check_broadcast_device(newdev)) - ret = NOTIFY_STOP; - - raw_spin_unlock_irqrestore(&tick_device_lock, flags); - - return ret; + tick_install_broadcast_device(newdev); } /* @@ -288,7 +337,7 @@ out_bc: * * Called with interrupts disabled. */ -static void tick_handover_do_timer(int *cpup) +void tick_handover_do_timer(int *cpup) { if (*cpup == tick_do_timer_cpu) { int cpu = cpumask_first(cpu_online_mask); @@ -305,13 +354,11 @@ static void tick_handover_do_timer(int *cpup) * access the hardware device itself. * We just set the mode and remove it from the lists. */ -static void tick_shutdown(unsigned int *cpup) +void tick_shutdown(unsigned int *cpup) { struct tick_device *td = &per_cpu(tick_cpu_device, *cpup); struct clock_event_device *dev = td->evtdev; - unsigned long flags; - raw_spin_lock_irqsave(&tick_device_lock, flags); td->mode = TICKDEV_MODE_PERIODIC; if (dev) { /* @@ -320,28 +367,23 @@ static void tick_shutdown(unsigned int *cpup) */ dev->mode = CLOCK_EVT_MODE_UNUSED; clockevents_exchange_device(dev, NULL); + dev->event_handler = clockevents_handle_noop; td->evtdev = NULL; } - raw_spin_unlock_irqrestore(&tick_device_lock, flags); } -static void tick_suspend(void) +void tick_suspend(void) { struct tick_device *td = &__get_cpu_var(tick_cpu_device); - unsigned long flags; - raw_spin_lock_irqsave(&tick_device_lock, flags); clockevents_shutdown(td->evtdev); - raw_spin_unlock_irqrestore(&tick_device_lock, flags); } -static void tick_resume(void) +void tick_resume(void) { struct tick_device *td = &__get_cpu_var(tick_cpu_device); - unsigned long flags; int broadcast = tick_resume_broadcast(); - raw_spin_lock_irqsave(&tick_device_lock, flags); clockevents_set_mode(td->evtdev, CLOCK_EVT_MODE_RESUME); if (!broadcast) { @@ -350,67 +392,12 @@ static void tick_resume(void) else tick_resume_oneshot(); } - raw_spin_unlock_irqrestore(&tick_device_lock, flags); -} - -/* - * Notification about clock event devices - */ -static int tick_notify(struct notifier_block *nb, unsigned long reason, - void *dev) -{ - switch (reason) { - - case CLOCK_EVT_NOTIFY_ADD: - return tick_check_new_device(dev); - - case CLOCK_EVT_NOTIFY_BROADCAST_ON: - case CLOCK_EVT_NOTIFY_BROADCAST_OFF: - case CLOCK_EVT_NOTIFY_BROADCAST_FORCE: - tick_broadcast_on_off(reason, dev); - break; - - case CLOCK_EVT_NOTIFY_BROADCAST_ENTER: - case CLOCK_EVT_NOTIFY_BROADCAST_EXIT: - tick_broadcast_oneshot_control(reason); - break; - - case CLOCK_EVT_NOTIFY_CPU_DYING: - tick_handover_do_timer(dev); - break; - - case CLOCK_EVT_NOTIFY_CPU_DEAD: - tick_shutdown_broadcast_oneshot(dev); - tick_shutdown_broadcast(dev); - tick_shutdown(dev); - break; - - case CLOCK_EVT_NOTIFY_SUSPEND: - tick_suspend(); - tick_suspend_broadcast(); - break; - - case CLOCK_EVT_NOTIFY_RESUME: - tick_resume(); - break; - - default: - break; - } - - return NOTIFY_OK; } -static struct notifier_block tick_notifier = { - .notifier_call = tick_notify, -}; - /** * tick_init - initialize the tick control - * - * Register the notifier with the clockevents framework */ void __init tick_init(void) { - clockevents_register_notifier(&tick_notifier); + tick_broadcast_init(); } diff --git a/kernel/time/tick-internal.h b/kernel/time/tick-internal.h index 290eefbc1f6..7ab92b19965 100644 --- a/kernel/time/tick-internal.h +++ b/kernel/time/tick-internal.h @@ -1,6 +1,14 @@ /* * tick internal variable and functions used by low/high res code */ +#include <linux/hrtimer.h> +#include <linux/tick.h> + +extern seqlock_t jiffies_lock; + +#define CS_NAME_LEN 32 + +#ifdef CONFIG_GENERIC_CLOCKEVENTS_BUILD #define TICK_DO_TIMER_NONE -1 #define TICK_DO_TIMER_BOOT -2 @@ -12,9 +20,19 @@ extern int tick_do_timer_cpu __read_mostly; extern void tick_setup_periodic(struct clock_event_device *dev, int broadcast); extern void tick_handle_periodic(struct clock_event_device *dev); +extern void tick_check_new_device(struct clock_event_device *dev); +extern void tick_handover_do_timer(int *cpup); +extern void tick_shutdown(unsigned int *cpup); +extern void tick_suspend(void); +extern void tick_resume(void); +extern bool tick_check_replacement(struct clock_event_device *curdev, + struct clock_event_device *newdev); +extern void tick_install_replacement(struct clock_event_device *dev); extern void clockevents_shutdown(struct clock_event_device *dev); +extern ssize_t sysfs_get_uname(const char *buf, char *dst, size_t cnt); + /* * NO_HZ / high resolution timer shared code */ @@ -22,30 +40,30 @@ extern void clockevents_shutdown(struct clock_event_device *dev); extern void tick_setup_oneshot(struct clock_event_device *newdev, void (*handler)(struct clock_event_device *), ktime_t nextevt); -extern int tick_dev_program_event(struct clock_event_device *dev, - ktime_t expires, int force); extern int tick_program_event(ktime_t expires, int force); extern void tick_oneshot_notify(void); extern int tick_switch_to_oneshot(void (*handler)(struct clock_event_device *)); extern void tick_resume_oneshot(void); # ifdef CONFIG_GENERIC_CLOCKEVENTS_BROADCAST extern void tick_broadcast_setup_oneshot(struct clock_event_device *bc); -extern void tick_broadcast_oneshot_control(unsigned long reason); +extern int tick_broadcast_oneshot_control(unsigned long reason); extern void tick_broadcast_switch_to_oneshot(void); extern void tick_shutdown_broadcast_oneshot(unsigned int *cpup); extern int tick_resume_broadcast_oneshot(struct clock_event_device *bc); extern int tick_broadcast_oneshot_active(void); -extern void tick_check_oneshot_broadcast(int cpu); +extern void tick_check_oneshot_broadcast_this_cpu(void); +bool tick_broadcast_oneshot_available(void); # else /* BROADCAST */ static inline void tick_broadcast_setup_oneshot(struct clock_event_device *bc) { BUG(); } -static inline void tick_broadcast_oneshot_control(unsigned long reason) { } +static inline int tick_broadcast_oneshot_control(unsigned long reason) { return 0; } static inline void tick_broadcast_switch_to_oneshot(void) { } static inline void tick_shutdown_broadcast_oneshot(unsigned int *cpup) { } static inline int tick_broadcast_oneshot_active(void) { return 0; } -static inline void tick_check_oneshot_broadcast(int cpu) { } +static inline void tick_check_oneshot_broadcast_this_cpu(void) { } +static inline bool tick_broadcast_oneshot_available(void) { return true; } # endif /* !BROADCAST */ #else /* !ONESHOT */ @@ -69,13 +87,14 @@ static inline void tick_broadcast_setup_oneshot(struct clock_event_device *bc) { BUG(); } -static inline void tick_broadcast_oneshot_control(unsigned long reason) { } +static inline int tick_broadcast_oneshot_control(unsigned long reason) { return 0; } static inline void tick_shutdown_broadcast_oneshot(unsigned int *cpup) { } static inline int tick_resume_broadcast_oneshot(struct clock_event_device *bc) { return 0; } static inline int tick_broadcast_oneshot_active(void) { return 0; } +static inline bool tick_broadcast_oneshot_available(void) { return false; } #endif /* !TICK_ONESHOT */ /* @@ -83,21 +102,21 @@ static inline int tick_broadcast_oneshot_active(void) { return 0; } */ #ifdef CONFIG_GENERIC_CLOCKEVENTS_BROADCAST extern int tick_device_uses_broadcast(struct clock_event_device *dev, int cpu); -extern int tick_check_broadcast_device(struct clock_event_device *dev); +extern void tick_install_broadcast_device(struct clock_event_device *dev); extern int tick_is_broadcast_device(struct clock_event_device *dev); extern void tick_broadcast_on_off(unsigned long reason, int *oncpu); extern void tick_shutdown_broadcast(unsigned int *cpup); extern void tick_suspend_broadcast(void); extern int tick_resume_broadcast(void); - +extern void tick_broadcast_init(void); extern void tick_set_periodic_handler(struct clock_event_device *dev, int broadcast); +int tick_broadcast_update_freq(struct clock_event_device *dev, u32 freq); #else /* !BROADCAST */ -static inline int tick_check_broadcast_device(struct clock_event_device *dev) +static inline void tick_install_broadcast_device(struct clock_event_device *dev) { - return 0; } static inline int tick_is_broadcast_device(struct clock_event_device *dev) @@ -114,6 +133,9 @@ static inline void tick_broadcast_on_off(unsigned long reason, int *oncpu) { } static inline void tick_shutdown_broadcast(unsigned int *cpup) { } static inline void tick_suspend_broadcast(void) { } static inline int tick_resume_broadcast(void) { return 0; } +static inline void tick_broadcast_init(void) { } +static inline int tick_broadcast_update_freq(struct clock_event_device *dev, + u32 freq) { return -ENODEV; } /* * Set the periodic handler in non broadcast mode @@ -132,3 +154,10 @@ static inline int tick_device_is_functional(struct clock_event_device *dev) { return !(dev->features & CLOCK_EVT_FEAT_DUMMY); } + +int __clockevents_update_freq(struct clock_event_device *dev, u32 freq); + +#endif + +extern void do_timer(unsigned long ticks); +extern void update_wall_time(void); diff --git a/kernel/time/tick-oneshot.c b/kernel/time/tick-oneshot.c index 0a8a213016f..824109060a3 100644 --- a/kernel/time/tick-oneshot.c +++ b/kernel/time/tick-oneshot.c @@ -18,58 +18,17 @@ #include <linux/percpu.h> #include <linux/profile.h> #include <linux/sched.h> -#include <linux/tick.h> #include "tick-internal.h" /** - * tick_program_event internal worker function - */ -int tick_dev_program_event(struct clock_event_device *dev, ktime_t expires, - int force) -{ - ktime_t now = ktime_get(); - int i; - - for (i = 0;;) { - int ret = clockevents_program_event(dev, expires, now); - - if (!ret || !force) - return ret; - - /* - * We tried 2 times to program the device with the given - * min_delta_ns. If that's not working then we double it - * and emit a warning. - */ - if (++i > 2) { - /* Increase the min. delta and try again */ - if (!dev->min_delta_ns) - dev->min_delta_ns = 5000; - else - dev->min_delta_ns += dev->min_delta_ns >> 1; - - printk(KERN_WARNING - "CE: %s increasing min_delta_ns to %llu nsec\n", - dev->name ? dev->name : "?", - (unsigned long long) dev->min_delta_ns << 1); - - i = 0; - } - - now = ktime_get(); - expires = ktime_add_ns(now, dev->min_delta_ns); - } -} - -/** * tick_program_event */ int tick_program_event(ktime_t expires, int force) { - struct clock_event_device *dev = __get_cpu_var(tick_cpu_device).evtdev; + struct clock_event_device *dev = __this_cpu_read(tick_cpu_device.evtdev); - return tick_dev_program_event(dev, expires, force); + return clockevents_program_event(dev, expires, force); } /** @@ -77,11 +36,10 @@ int tick_program_event(ktime_t expires, int force) */ void tick_resume_oneshot(void) { - struct tick_device *td = &__get_cpu_var(tick_cpu_device); - struct clock_event_device *dev = td->evtdev; + struct clock_event_device *dev = __this_cpu_read(tick_cpu_device.evtdev); clockevents_set_mode(dev, CLOCK_EVT_MODE_ONESHOT); - tick_program_event(ktime_get(), 1); + clockevents_program_event(dev, ktime_get(), true); } /** @@ -93,7 +51,7 @@ void tick_setup_oneshot(struct clock_event_device *newdev, { newdev->event_handler = handler; clockevents_set_mode(newdev, CLOCK_EVT_MODE_ONESHOT); - tick_dev_program_event(newdev, next_event, 1); + clockevents_program_event(newdev, next_event, true); } /** @@ -139,7 +97,7 @@ int tick_oneshot_mode_active(void) int ret; local_irq_save(flags); - ret = __get_cpu_var(tick_cpu_device).mode == TICKDEV_MODE_ONESHOT; + ret = __this_cpu_read(tick_cpu_device.mode) == TICKDEV_MODE_ONESHOT; local_irq_restore(flags); return ret; diff --git a/kernel/time/tick-sched.c b/kernel/time/tick-sched.c index f992762d7f5..6558b7ac112 100644 --- a/kernel/time/tick-sched.c +++ b/kernel/time/tick-sched.c @@ -19,20 +19,25 @@ #include <linux/percpu.h> #include <linux/profile.h> #include <linux/sched.h> -#include <linux/tick.h> #include <linux/module.h> +#include <linux/irq_work.h> +#include <linux/posix-timers.h> +#include <linux/perf_event.h> +#include <linux/context_tracking.h> #include <asm/irq_regs.h> #include "tick-internal.h" +#include <trace/events/timer.h> + /* * Per cpu nohz control structure */ -static DEFINE_PER_CPU(struct tick_sched, tick_cpu_sched); +DEFINE_PER_CPU(struct tick_sched, tick_cpu_sched); /* - * The time, when the last jiffy update happened. Protected by xtime_lock. + * The time, when the last jiffy update happened. Protected by jiffies_lock. */ static ktime_t last_jiffies_update; @@ -50,14 +55,14 @@ static void tick_do_update_jiffies64(ktime_t now) ktime_t delta; /* - * Do a quick check without holding xtime_lock: + * Do a quick check without holding jiffies_lock: */ delta = ktime_sub(now, last_jiffies_update); if (delta.tv64 < tick_period.tv64) return; - /* Reevalute with xtime_lock held */ - write_seqlock(&xtime_lock); + /* Reevalute with jiffies_lock held */ + write_seqlock(&jiffies_lock); delta = ktime_sub(now, last_jiffies_update); if (delta.tv64 >= tick_period.tv64) { @@ -79,8 +84,12 @@ static void tick_do_update_jiffies64(ktime_t now) /* Keep the tick_next_period variable up to date */ tick_next_period = ktime_add(last_jiffies_update, tick_period); + } else { + write_sequnlock(&jiffies_lock); + return; } - write_sequnlock(&xtime_lock); + write_sequnlock(&jiffies_lock); + update_wall_time(); } /* @@ -90,24 +99,274 @@ static ktime_t tick_init_jiffy_update(void) { ktime_t period; - write_seqlock(&xtime_lock); + write_seqlock(&jiffies_lock); /* Did we start the jiffies update yet ? */ if (last_jiffies_update.tv64 == 0) last_jiffies_update = tick_next_period; period = last_jiffies_update; - write_sequnlock(&xtime_lock); + write_sequnlock(&jiffies_lock); return period; } + +static void tick_sched_do_timer(ktime_t now) +{ + int cpu = smp_processor_id(); + +#ifdef CONFIG_NO_HZ_COMMON + /* + * Check if the do_timer duty was dropped. We don't care about + * concurrency: This happens only when the cpu in charge went + * into a long sleep. If two cpus happen to assign themself to + * this duty, then the jiffies update is still serialized by + * jiffies_lock. + */ + if (unlikely(tick_do_timer_cpu == TICK_DO_TIMER_NONE) + && !tick_nohz_full_cpu(cpu)) + tick_do_timer_cpu = cpu; +#endif + + /* Check, if the jiffies need an update */ + if (tick_do_timer_cpu == cpu) + tick_do_update_jiffies64(now); +} + +static void tick_sched_handle(struct tick_sched *ts, struct pt_regs *regs) +{ +#ifdef CONFIG_NO_HZ_COMMON + /* + * When we are idle and the tick is stopped, we have to touch + * the watchdog as we might not schedule for a really long + * time. This happens on complete idle SMP systems while + * waiting on the login prompt. We also increment the "start of + * idle" jiffy stamp so the idle accounting adjustment we do + * when we go busy again does not account too much ticks. + */ + if (ts->tick_stopped) { + touch_softlockup_watchdog(); + if (is_idle_task(current)) + ts->idle_jiffies++; + } +#endif + update_process_times(user_mode(regs)); + profile_tick(CPU_PROFILING); +} + +#ifdef CONFIG_NO_HZ_FULL +cpumask_var_t tick_nohz_full_mask; +bool tick_nohz_full_running; + +static bool can_stop_full_tick(void) +{ + WARN_ON_ONCE(!irqs_disabled()); + + if (!sched_can_stop_tick()) { + trace_tick_stop(0, "more than 1 task in runqueue\n"); + return false; + } + + if (!posix_cpu_timers_can_stop_tick(current)) { + trace_tick_stop(0, "posix timers running\n"); + return false; + } + + if (!perf_event_can_stop_tick()) { + trace_tick_stop(0, "perf events running\n"); + return false; + } + + /* sched_clock_tick() needs us? */ +#ifdef CONFIG_HAVE_UNSTABLE_SCHED_CLOCK + /* + * TODO: kick full dynticks CPUs when + * sched_clock_stable is set. + */ + if (!sched_clock_stable()) { + trace_tick_stop(0, "unstable sched clock\n"); + /* + * Don't allow the user to think they can get + * full NO_HZ with this machine. + */ + WARN_ONCE(tick_nohz_full_running, + "NO_HZ FULL will not work with unstable sched clock"); + return false; + } +#endif + + return true; +} + +static void tick_nohz_restart_sched_tick(struct tick_sched *ts, ktime_t now); + +/* + * Re-evaluate the need for the tick on the current CPU + * and restart it if necessary. + */ +void __tick_nohz_full_check(void) +{ + struct tick_sched *ts = &__get_cpu_var(tick_cpu_sched); + + if (tick_nohz_full_cpu(smp_processor_id())) { + if (ts->tick_stopped && !is_idle_task(current)) { + if (!can_stop_full_tick()) + tick_nohz_restart_sched_tick(ts, ktime_get()); + } + } +} + +static void nohz_full_kick_work_func(struct irq_work *work) +{ + __tick_nohz_full_check(); +} + +static DEFINE_PER_CPU(struct irq_work, nohz_full_kick_work) = { + .func = nohz_full_kick_work_func, +}; + +/* + * Kick the current CPU if it's full dynticks in order to force it to + * re-evaluate its dependency on the tick and restart it if necessary. + */ +void tick_nohz_full_kick(void) +{ + if (tick_nohz_full_cpu(smp_processor_id())) + irq_work_queue(&__get_cpu_var(nohz_full_kick_work)); +} + +static void nohz_full_kick_ipi(void *info) +{ + __tick_nohz_full_check(); +} + +/* + * Kick all full dynticks CPUs in order to force these to re-evaluate + * their dependency on the tick and restart it if necessary. + */ +void tick_nohz_full_kick_all(void) +{ + if (!tick_nohz_full_running) + return; + + preempt_disable(); + smp_call_function_many(tick_nohz_full_mask, + nohz_full_kick_ipi, NULL, false); + tick_nohz_full_kick(); + preempt_enable(); +} + +/* + * Re-evaluate the need for the tick as we switch the current task. + * It might need the tick due to per task/process properties: + * perf events, posix cpu timers, ... + */ +void __tick_nohz_task_switch(struct task_struct *tsk) +{ + unsigned long flags; + + local_irq_save(flags); + + if (!tick_nohz_full_cpu(smp_processor_id())) + goto out; + + if (tick_nohz_tick_stopped() && !can_stop_full_tick()) + tick_nohz_full_kick(); + +out: + local_irq_restore(flags); +} + +/* Parse the boot-time nohz CPU list from the kernel parameters. */ +static int __init tick_nohz_full_setup(char *str) +{ + int cpu; + + alloc_bootmem_cpumask_var(&tick_nohz_full_mask); + if (cpulist_parse(str, tick_nohz_full_mask) < 0) { + pr_warning("NOHZ: Incorrect nohz_full cpumask\n"); + return 1; + } + + cpu = smp_processor_id(); + if (cpumask_test_cpu(cpu, tick_nohz_full_mask)) { + pr_warning("NO_HZ: Clearing %d from nohz_full range for timekeeping\n", cpu); + cpumask_clear_cpu(cpu, tick_nohz_full_mask); + } + tick_nohz_full_running = true; + + return 1; +} +__setup("nohz_full=", tick_nohz_full_setup); + +static int tick_nohz_cpu_down_callback(struct notifier_block *nfb, + unsigned long action, + void *hcpu) +{ + unsigned int cpu = (unsigned long)hcpu; + + switch (action & ~CPU_TASKS_FROZEN) { + case CPU_DOWN_PREPARE: + /* + * If we handle the timekeeping duty for full dynticks CPUs, + * we can't safely shutdown that CPU. + */ + if (tick_nohz_full_running && tick_do_timer_cpu == cpu) + return NOTIFY_BAD; + break; + } + return NOTIFY_OK; +} + +/* + * Worst case string length in chunks of CPU range seems 2 steps + * separations: 0,2,4,6,... + * This is NR_CPUS + sizeof('\0') + */ +static char __initdata nohz_full_buf[NR_CPUS + 1]; + +static int tick_nohz_init_all(void) +{ + int err = -1; + +#ifdef CONFIG_NO_HZ_FULL_ALL + if (!alloc_cpumask_var(&tick_nohz_full_mask, GFP_KERNEL)) { + pr_err("NO_HZ: Can't allocate full dynticks cpumask\n"); + return err; + } + err = 0; + cpumask_setall(tick_nohz_full_mask); + cpumask_clear_cpu(smp_processor_id(), tick_nohz_full_mask); + tick_nohz_full_running = true; +#endif + return err; +} + +void __init tick_nohz_init(void) +{ + int cpu; + + if (!tick_nohz_full_running) { + if (tick_nohz_init_all() < 0) + return; + } + + for_each_cpu(cpu, tick_nohz_full_mask) + context_tracking_cpu_set(cpu); + + cpu_notifier(tick_nohz_cpu_down_callback, 0); + cpulist_scnprintf(nohz_full_buf, sizeof(nohz_full_buf), tick_nohz_full_mask); + pr_info("NO_HZ: Full dynticks CPUs: %s.\n", nohz_full_buf); +} +#endif + /* * NOHZ - aka dynamic tick functionality */ -#ifdef CONFIG_NO_HZ +#ifdef CONFIG_NO_HZ_COMMON /* * NO HZ enabled ? */ static int tick_nohz_enabled __read_mostly = 1; - +int tick_nohz_active __read_mostly; /* * Enable / Disable tickless mode */ @@ -136,12 +395,9 @@ __setup("nohz=", setup_tick_nohz); */ static void tick_nohz_update_jiffies(ktime_t now) { - int cpu = smp_processor_id(); - struct tick_sched *ts = &per_cpu(tick_cpu_sched, cpu); unsigned long flags; - cpumask_clear_cpu(cpu, nohz_cpu_mask); - ts->idle_waketime = now; + __this_cpu_write(tick_cpu_sched.idle_waketime, now); local_irq_save(flags); tick_do_update_jiffies64(now); @@ -150,14 +406,31 @@ static void tick_nohz_update_jiffies(ktime_t now) touch_softlockup_watchdog(); } -static void tick_nohz_stop_idle(int cpu, ktime_t now) +/* + * Updates the per cpu time idle statistics counters + */ +static void +update_ts_time_stats(int cpu, struct tick_sched *ts, ktime_t now, u64 *last_update_time) { - struct tick_sched *ts = &per_cpu(tick_cpu_sched, cpu); ktime_t delta; - delta = ktime_sub(now, ts->idle_entrytime); - ts->idle_lastupdate = now; - ts->idle_sleeptime = ktime_add(ts->idle_sleeptime, delta); + if (ts->idle_active) { + delta = ktime_sub(now, ts->idle_entrytime); + if (nr_iowait_cpu(cpu) > 0) + ts->iowait_sleeptime = ktime_add(ts->iowait_sleeptime, delta); + else + ts->idle_sleeptime = ktime_add(ts->idle_sleeptime, delta); + ts->idle_entrytime = now; + } + + if (last_update_time) + *last_update_time = ktime_to_us(now); + +} + +static void tick_nohz_stop_idle(struct tick_sched *ts, ktime_t now) +{ + update_ts_time_stats(smp_processor_id(), ts, now, NULL); ts->idle_active = 0; sched_clock_idle_wakeup_event(0); @@ -165,126 +438,132 @@ static void tick_nohz_stop_idle(int cpu, ktime_t now) static ktime_t tick_nohz_start_idle(struct tick_sched *ts) { - ktime_t now, delta; + ktime_t now = ktime_get(); - now = ktime_get(); - if (ts->idle_active) { - delta = ktime_sub(now, ts->idle_entrytime); - ts->idle_lastupdate = now; - ts->idle_sleeptime = ktime_add(ts->idle_sleeptime, delta); - } ts->idle_entrytime = now; ts->idle_active = 1; sched_clock_idle_sleep_event(); return now; } +/** + * get_cpu_idle_time_us - get the total idle time of a cpu + * @cpu: CPU number to query + * @last_update_time: variable to store update time in. Do not update + * counters if NULL. + * + * Return the cummulative idle time (since boot) for a given + * CPU, in microseconds. + * + * This time is measured via accounting rather than sampling, + * and is as accurate as ktime_get() is. + * + * This function returns -1 if NOHZ is not enabled. + */ u64 get_cpu_idle_time_us(int cpu, u64 *last_update_time) { struct tick_sched *ts = &per_cpu(tick_cpu_sched, cpu); + ktime_t now, idle; - if (!tick_nohz_enabled) + if (!tick_nohz_active) return -1; - if (ts->idle_active) - *last_update_time = ktime_to_us(ts->idle_lastupdate); - else - *last_update_time = ktime_to_us(ktime_get()); + now = ktime_get(); + if (last_update_time) { + update_ts_time_stats(cpu, ts, now, last_update_time); + idle = ts->idle_sleeptime; + } else { + if (ts->idle_active && !nr_iowait_cpu(cpu)) { + ktime_t delta = ktime_sub(now, ts->idle_entrytime); + + idle = ktime_add(ts->idle_sleeptime, delta); + } else { + idle = ts->idle_sleeptime; + } + } + + return ktime_to_us(idle); - return ktime_to_us(ts->idle_sleeptime); } EXPORT_SYMBOL_GPL(get_cpu_idle_time_us); /** - * tick_nohz_stop_sched_tick - stop the idle tick from the idle task + * get_cpu_iowait_time_us - get the total iowait time of a cpu + * @cpu: CPU number to query + * @last_update_time: variable to store update time in. Do not update + * counters if NULL. * - * When the next event is more than a tick into the future, stop the idle tick - * Called either from the idle loop or from irq_exit() when an idle period was - * just interrupted by an interrupt which did not cause a reschedule. + * Return the cummulative iowait time (since boot) for a given + * CPU, in microseconds. + * + * This time is measured via accounting rather than sampling, + * and is as accurate as ktime_get() is. + * + * This function returns -1 if NOHZ is not enabled. */ -void tick_nohz_stop_sched_tick(int inidle) +u64 get_cpu_iowait_time_us(int cpu, u64 *last_update_time) { - unsigned long seq, last_jiffies, next_jiffies, delta_jiffies, flags; - struct tick_sched *ts; - ktime_t last_update, expires, now; - struct clock_event_device *dev = __get_cpu_var(tick_cpu_device).evtdev; - u64 time_delta; - int cpu; - - local_irq_save(flags); - - cpu = smp_processor_id(); - ts = &per_cpu(tick_cpu_sched, cpu); - - /* - * Call to tick_nohz_start_idle stops the last_update_time from being - * updated. Thus, it must not be called in the event we are called from - * irq_exit() with the prior state different than idle. - */ - if (!inidle && !ts->inidle) - goto end; + struct tick_sched *ts = &per_cpu(tick_cpu_sched, cpu); + ktime_t now, iowait; - /* - * Set ts->inidle unconditionally. Even if the system did not - * switch to NOHZ mode the cpu frequency governers rely on the - * update of the idle time accounting in tick_nohz_start_idle(). - */ - ts->inidle = 1; + if (!tick_nohz_active) + return -1; - now = tick_nohz_start_idle(ts); + now = ktime_get(); + if (last_update_time) { + update_ts_time_stats(cpu, ts, now, last_update_time); + iowait = ts->iowait_sleeptime; + } else { + if (ts->idle_active && nr_iowait_cpu(cpu) > 0) { + ktime_t delta = ktime_sub(now, ts->idle_entrytime); - /* - * If this cpu is offline and it is the one which updates - * jiffies, then give up the assignment and let it be taken by - * the cpu which runs the tick timer next. If we don't drop - * this here the jiffies might be stale and do_timer() never - * invoked. - */ - if (unlikely(!cpu_online(cpu))) { - if (cpu == tick_do_timer_cpu) - tick_do_timer_cpu = TICK_DO_TIMER_NONE; + iowait = ktime_add(ts->iowait_sleeptime, delta); + } else { + iowait = ts->iowait_sleeptime; + } } - if (unlikely(ts->nohz_mode == NOHZ_MODE_INACTIVE)) - goto end; - - if (need_resched()) - goto end; + return ktime_to_us(iowait); +} +EXPORT_SYMBOL_GPL(get_cpu_iowait_time_us); - if (unlikely(local_softirq_pending() && cpu_online(cpu))) { - static int ratelimit; +static ktime_t tick_nohz_stop_sched_tick(struct tick_sched *ts, + ktime_t now, int cpu) +{ + unsigned long seq, last_jiffies, next_jiffies, delta_jiffies; + ktime_t last_update, expires, ret = { .tv64 = 0 }; + unsigned long rcu_delta_jiffies; + struct clock_event_device *dev = __get_cpu_var(tick_cpu_device).evtdev; + u64 time_delta; - if (ratelimit < 10) { - printk(KERN_ERR "NOHZ: local_softirq_pending %02x\n", - (unsigned int) local_softirq_pending()); - ratelimit++; - } - goto end; - } + time_delta = timekeeping_max_deferment(); - ts->idle_calls++; /* Read jiffies and the time when jiffies were updated last */ do { - seq = read_seqbegin(&xtime_lock); + seq = read_seqbegin(&jiffies_lock); last_update = last_jiffies_update; last_jiffies = jiffies; - time_delta = timekeeping_max_deferment(); - } while (read_seqretry(&xtime_lock, seq)); + } while (read_seqretry(&jiffies_lock, seq)); - if (rcu_needs_cpu(cpu) || printk_needs_cpu(cpu) || - arch_needs_cpu(cpu)) { + if (rcu_needs_cpu(cpu, &rcu_delta_jiffies) || + arch_needs_cpu(cpu) || irq_work_needs_cpu()) { next_jiffies = last_jiffies + 1; delta_jiffies = 1; } else { /* Get the next timer wheel timer */ next_jiffies = get_next_timer_interrupt(last_jiffies); delta_jiffies = next_jiffies - last_jiffies; + if (rcu_delta_jiffies < delta_jiffies) { + next_jiffies = last_jiffies + rcu_delta_jiffies; + delta_jiffies = rcu_delta_jiffies; + } } + /* - * Do not stop the tick, if we are only one off - * or if the cpu is required for rcu + * Do not stop the tick, if we are only one off (or less) + * or if the cpu is required for RCU: */ - if (!ts->tick_stopped && delta_jiffies == 1) + if (!ts->tick_stopped && delta_jiffies <= 1) goto out; /* Schedule the tick, if we are at least one jiffie off */ @@ -313,6 +592,13 @@ void tick_nohz_stop_sched_tick(int inidle) time_delta = KTIME_MAX; } +#ifdef CONFIG_NO_HZ_FULL + if (!ts->inidle) { + time_delta = min(time_delta, + scheduler_tick_max_deferment()); + } +#endif + /* * calculate the expiry time for the next timer wheel * timer. delta_jiffies >= NEXT_TIMER_MAX_DELTA signals @@ -337,13 +623,12 @@ void tick_nohz_stop_sched_tick(int inidle) else expires.tv64 = KTIME_MAX; - if (delta_jiffies > 1) - cpumask_set_cpu(cpu, nohz_cpu_mask); - /* Skip reprogram of event if its not changed */ if (ts->tick_stopped && ktime_equal(expires, dev->next_event)) goto out; + ret = expires; + /* * nohz_stop_sched_tick can be called several times before * the nohz_restart_sched_tick is called. This happens when @@ -352,25 +637,14 @@ void tick_nohz_stop_sched_tick(int inidle) * the scheduler tick in nohz_restart_sched_tick. */ if (!ts->tick_stopped) { - if (select_nohz_load_balancer(1)) { - /* - * sched tick not stopped! - */ - cpumask_clear_cpu(cpu, nohz_cpu_mask); - goto out; - } + nohz_balance_enter_idle(cpu); + calc_load_enter_idle(); - ts->idle_tick = hrtimer_get_expires(&ts->sched_timer); + ts->last_tick = hrtimer_get_expires(&ts->sched_timer); ts->tick_stopped = 1; - ts->idle_jiffies = last_jiffies; - rcu_enter_nohz(); + trace_tick_stop(1, " "); } - ts->idle_sleeps++; - - /* Mark expires */ - ts->idle_expires = expires; - /* * If the expiration time == KTIME_MAX, then * in this case we simply stop the tick timer. @@ -395,15 +669,162 @@ void tick_nohz_stop_sched_tick(int inidle) * softirq. */ tick_do_update_jiffies64(ktime_get()); - cpumask_clear_cpu(cpu, nohz_cpu_mask); } raise_softirq_irqoff(TIMER_SOFTIRQ); out: ts->next_jiffies = next_jiffies; ts->last_jiffies = last_jiffies; ts->sleep_length = ktime_sub(dev->next_event, now); -end: - local_irq_restore(flags); + + return ret; +} + +static void tick_nohz_full_stop_tick(struct tick_sched *ts) +{ +#ifdef CONFIG_NO_HZ_FULL + int cpu = smp_processor_id(); + + if (!tick_nohz_full_cpu(cpu) || is_idle_task(current)) + return; + + if (!ts->tick_stopped && ts->nohz_mode == NOHZ_MODE_INACTIVE) + return; + + if (!can_stop_full_tick()) + return; + + tick_nohz_stop_sched_tick(ts, ktime_get(), cpu); +#endif +} + +static bool can_stop_idle_tick(int cpu, struct tick_sched *ts) +{ + /* + * If this cpu is offline and it is the one which updates + * jiffies, then give up the assignment and let it be taken by + * the cpu which runs the tick timer next. If we don't drop + * this here the jiffies might be stale and do_timer() never + * invoked. + */ + if (unlikely(!cpu_online(cpu))) { + if (cpu == tick_do_timer_cpu) + tick_do_timer_cpu = TICK_DO_TIMER_NONE; + return false; + } + + if (unlikely(ts->nohz_mode == NOHZ_MODE_INACTIVE)) { + ts->sleep_length = (ktime_t) { .tv64 = NSEC_PER_SEC/HZ }; + return false; + } + + if (need_resched()) + return false; + + if (unlikely(local_softirq_pending() && cpu_online(cpu))) { + static int ratelimit; + + if (ratelimit < 10 && + (local_softirq_pending() & SOFTIRQ_STOP_IDLE_MASK)) { + pr_warn("NOHZ: local_softirq_pending %02x\n", + (unsigned int) local_softirq_pending()); + ratelimit++; + } + return false; + } + + if (tick_nohz_full_enabled()) { + /* + * Keep the tick alive to guarantee timekeeping progression + * if there are full dynticks CPUs around + */ + if (tick_do_timer_cpu == cpu) + return false; + /* + * Boot safety: make sure the timekeeping duty has been + * assigned before entering dyntick-idle mode, + */ + if (tick_do_timer_cpu == TICK_DO_TIMER_NONE) + return false; + } + + return true; +} + +static void __tick_nohz_idle_enter(struct tick_sched *ts) +{ + ktime_t now, expires; + int cpu = smp_processor_id(); + + now = tick_nohz_start_idle(ts); + + if (can_stop_idle_tick(cpu, ts)) { + int was_stopped = ts->tick_stopped; + + ts->idle_calls++; + + expires = tick_nohz_stop_sched_tick(ts, now, cpu); + if (expires.tv64 > 0LL) { + ts->idle_sleeps++; + ts->idle_expires = expires; + } + + if (!was_stopped && ts->tick_stopped) + ts->idle_jiffies = ts->last_jiffies; + } +} + +/** + * tick_nohz_idle_enter - stop the idle tick from the idle task + * + * When the next event is more than a tick into the future, stop the idle tick + * Called when we start the idle loop. + * + * The arch is responsible of calling: + * + * - rcu_idle_enter() after its last use of RCU before the CPU is put + * to sleep. + * - rcu_idle_exit() before the first use of RCU after the CPU is woken up. + */ +void tick_nohz_idle_enter(void) +{ + struct tick_sched *ts; + + WARN_ON_ONCE(irqs_disabled()); + + /* + * Update the idle state in the scheduler domain hierarchy + * when tick_nohz_stop_sched_tick() is called from the idle loop. + * State will be updated to busy during the first busy tick after + * exiting idle. + */ + set_cpu_sd_state_idle(); + + local_irq_disable(); + + ts = &__get_cpu_var(tick_cpu_sched); + ts->inidle = 1; + __tick_nohz_idle_enter(ts); + + local_irq_enable(); +} +EXPORT_SYMBOL_GPL(tick_nohz_idle_enter); + +/** + * tick_nohz_irq_exit - update next tick event from interrupt exit + * + * When an interrupt fires while we are idle and it doesn't cause + * a reschedule, it may still add, modify or delete a timer, enqueue + * an RCU callback, etc... + * So we need to re-calculate and reprogram the next tick event. + */ +void tick_nohz_irq_exit(void) +{ + struct tick_sched *ts = &__get_cpu_var(tick_cpu_sched); + + if (ts->inidle) + __tick_nohz_idle_enter(ts); + else + tick_nohz_full_stop_tick(ts); } /** @@ -421,7 +842,7 @@ ktime_t tick_nohz_get_sleep_length(void) static void tick_nohz_restart(struct tick_sched *ts, ktime_t now) { hrtimer_cancel(&ts->sched_timer); - hrtimer_set_expires(&ts->sched_timer, ts->idle_tick); + hrtimer_set_expires(&ts->sched_timer, ts->last_tick); while (1) { /* Forward the time to expire in the future */ @@ -438,49 +859,36 @@ static void tick_nohz_restart(struct tick_sched *ts, ktime_t now) hrtimer_get_expires(&ts->sched_timer), 0)) break; } - /* Update jiffies and reread time */ - tick_do_update_jiffies64(now); + /* Reread time and update jiffies */ now = ktime_get(); + tick_do_update_jiffies64(now); } } -/** - * tick_nohz_restart_sched_tick - restart the idle tick from the idle task - * - * Restart the idle tick when the CPU is woken up from idle - */ -void tick_nohz_restart_sched_tick(void) +static void tick_nohz_restart_sched_tick(struct tick_sched *ts, ktime_t now) { - int cpu = smp_processor_id(); - struct tick_sched *ts = &per_cpu(tick_cpu_sched, cpu); -#ifndef CONFIG_VIRT_CPU_ACCOUNTING - unsigned long ticks; -#endif - ktime_t now; - - local_irq_disable(); - if (ts->idle_active || (ts->inidle && ts->tick_stopped)) - now = ktime_get(); - - if (ts->idle_active) - tick_nohz_stop_idle(cpu, now); - - if (!ts->inidle || !ts->tick_stopped) { - ts->inidle = 0; - local_irq_enable(); - return; - } + /* Update jiffies first */ + tick_do_update_jiffies64(now); + update_cpu_load_nohz(); - ts->inidle = 0; + calc_load_exit_idle(); + touch_softlockup_watchdog(); + /* + * Cancel the scheduled timer and restore the tick + */ + ts->tick_stopped = 0; + ts->idle_exittime = now; - rcu_exit_nohz(); + tick_nohz_restart(ts, now); +} - /* Update jiffies first */ - select_nohz_load_balancer(0); - tick_do_update_jiffies64(now); - cpumask_clear_cpu(cpu, nohz_cpu_mask); +static void tick_nohz_account_idle_ticks(struct tick_sched *ts) +{ +#ifndef CONFIG_VIRT_CPU_ACCOUNTING_NATIVE + unsigned long ticks; -#ifndef CONFIG_VIRT_CPU_ACCOUNTING + if (vtime_accounting_enabled()) + return; /* * We stopped the tick in idle. Update process times would miss the * time we slept as update_process_times does only a 1 tick @@ -493,18 +901,40 @@ void tick_nohz_restart_sched_tick(void) if (ticks && ticks < LONG_MAX) account_idle_ticks(ticks); #endif +} - touch_softlockup_watchdog(); - /* - * Cancel the scheduled timer and restore the tick - */ - ts->tick_stopped = 0; - ts->idle_exittime = now; +/** + * tick_nohz_idle_exit - restart the idle tick from the idle task + * + * Restart the idle tick when the CPU is woken up from idle + * This also exit the RCU extended quiescent state. The CPU + * can use RCU again after this function is called. + */ +void tick_nohz_idle_exit(void) +{ + struct tick_sched *ts = &__get_cpu_var(tick_cpu_sched); + ktime_t now; - tick_nohz_restart(ts, now); + local_irq_disable(); + + WARN_ON_ONCE(!ts->inidle); + + ts->inidle = 0; + + if (ts->idle_active || ts->tick_stopped) + now = ktime_get(); + + if (ts->idle_active) + tick_nohz_stop_idle(ts, now); + + if (ts->tick_stopped) { + tick_nohz_restart_sched_tick(ts, now); + tick_nohz_account_idle_ticks(ts); + } local_irq_enable(); } +EXPORT_SYMBOL_GPL(tick_nohz_idle_exit); static int tick_nohz_reprogram(struct tick_sched *ts, ktime_t now) { @@ -519,40 +949,12 @@ static void tick_nohz_handler(struct clock_event_device *dev) { struct tick_sched *ts = &__get_cpu_var(tick_cpu_sched); struct pt_regs *regs = get_irq_regs(); - int cpu = smp_processor_id(); ktime_t now = ktime_get(); dev->next_event.tv64 = KTIME_MAX; - /* - * Check if the do_timer duty was dropped. We don't care about - * concurrency: This happens only when the cpu in charge went - * into a long sleep. If two cpus happen to assign themself to - * this duty, then the jiffies update is still serialized by - * xtime_lock. - */ - if (unlikely(tick_do_timer_cpu == TICK_DO_TIMER_NONE)) - tick_do_timer_cpu = cpu; - - /* Check, if the jiffies need an update */ - if (tick_do_timer_cpu == cpu) - tick_do_update_jiffies64(now); - - /* - * When we are idle and the tick is stopped, we have to touch - * the watchdog as we might not schedule for a really long - * time. This happens on complete idle SMP systems while - * waiting on the login prompt. We also increment the "start - * of idle" jiffy stamp so the idle accounting adjustment we - * do when we go busy again does not account too much ticks. - */ - if (ts->tick_stopped) { - touch_softlockup_watchdog(); - ts->idle_jiffies++; - } - - update_process_times(user_mode(regs)); - profile_tick(CPU_PROFILING); + tick_sched_do_timer(now); + tick_sched_handle(ts, regs); while (tick_nohz_reprogram(ts, now)) { now = ktime_get(); @@ -576,7 +978,7 @@ static void tick_nohz_switch_to_nohz(void) local_irq_enable(); return; } - + tick_nohz_active = 1; ts->nohz_mode = NOHZ_MODE_LOWRES; /* @@ -594,9 +996,6 @@ static void tick_nohz_switch_to_nohz(void) next = ktime_add(next, tick_period); } local_irq_enable(); - - printk(KERN_INFO "Switched to NOHz mode on CPU #%d\n", - smp_processor_id()); } /* @@ -610,12 +1009,10 @@ static void tick_nohz_switch_to_nohz(void) * timer and do not touch the other magic bits which need to be done * when idle is left. */ -static void tick_nohz_kick_tick(int cpu, ktime_t now) +static void tick_nohz_kick_tick(struct tick_sched *ts, ktime_t now) { #if 0 /* Switch back to 2.6.27 behaviour */ - - struct tick_sched *ts = &per_cpu(tick_cpu_sched, cpu); ktime_t delta; /* @@ -630,36 +1027,36 @@ static void tick_nohz_kick_tick(int cpu, ktime_t now) #endif } -static inline void tick_check_nohz(int cpu) +static inline void tick_nohz_irq_enter(void) { - struct tick_sched *ts = &per_cpu(tick_cpu_sched, cpu); + struct tick_sched *ts = &__get_cpu_var(tick_cpu_sched); ktime_t now; if (!ts->idle_active && !ts->tick_stopped) return; now = ktime_get(); if (ts->idle_active) - tick_nohz_stop_idle(cpu, now); + tick_nohz_stop_idle(ts, now); if (ts->tick_stopped) { tick_nohz_update_jiffies(now); - tick_nohz_kick_tick(cpu, now); + tick_nohz_kick_tick(ts, now); } } #else static inline void tick_nohz_switch_to_nohz(void) { } -static inline void tick_check_nohz(int cpu) { } +static inline void tick_nohz_irq_enter(void) { } -#endif /* NO_HZ */ +#endif /* CONFIG_NO_HZ_COMMON */ /* * Called from irq_enter to notify about the possible interruption of idle() */ -void tick_check_idle(int cpu) +void tick_irq_enter(void) { - tick_check_oneshot_broadcast(cpu); - tick_check_nohz(cpu); + tick_check_oneshot_broadcast_this_cpu(); + tick_nohz_irq_enter(); } /* @@ -668,7 +1065,7 @@ void tick_check_idle(int cpu) #ifdef CONFIG_HIGH_RES_TIMERS /* * We rearm the timer until we get disabled by the idle code. - * Called with interrupts disabled and timer->base->cpu_base->lock held. + * Called with interrupts disabled. */ static enum hrtimer_restart tick_sched_timer(struct hrtimer *timer) { @@ -676,50 +1073,31 @@ static enum hrtimer_restart tick_sched_timer(struct hrtimer *timer) container_of(timer, struct tick_sched, sched_timer); struct pt_regs *regs = get_irq_regs(); ktime_t now = ktime_get(); - int cpu = smp_processor_id(); -#ifdef CONFIG_NO_HZ - /* - * Check if the do_timer duty was dropped. We don't care about - * concurrency: This happens only when the cpu in charge went - * into a long sleep. If two cpus happen to assign themself to - * this duty, then the jiffies update is still serialized by - * xtime_lock. - */ - if (unlikely(tick_do_timer_cpu == TICK_DO_TIMER_NONE)) - tick_do_timer_cpu = cpu; -#endif - - /* Check, if the jiffies need an update */ - if (tick_do_timer_cpu == cpu) - tick_do_update_jiffies64(now); + tick_sched_do_timer(now); /* * Do not call, when we are not in irq context and have * no valid regs pointer */ - if (regs) { - /* - * When we are idle and the tick is stopped, we have to touch - * the watchdog as we might not schedule for a really long - * time. This happens on complete idle SMP systems while - * waiting on the login prompt. We also increment the "start of - * idle" jiffy stamp so the idle accounting adjustment we do - * when we go busy again does not account too much ticks. - */ - if (ts->tick_stopped) { - touch_softlockup_watchdog(); - ts->idle_jiffies++; - } - update_process_times(user_mode(regs)); - profile_tick(CPU_PROFILING); - } + if (regs) + tick_sched_handle(ts, regs); hrtimer_forward(timer, now, tick_period); return HRTIMER_RESTART; } +static int sched_skew_tick; + +static int __init skew_tick(char *str) +{ + get_option(&str, &sched_skew_tick); + + return 0; +} +early_param("skew_tick", skew_tick); + /** * tick_setup_sched_timer - setup the tick emulation timer */ @@ -727,7 +1105,6 @@ void tick_setup_sched_timer(void) { struct tick_sched *ts = &__get_cpu_var(tick_cpu_sched); ktime_t now = ktime_get(); - u64 offset; /* * Emulate tick processing via per-CPU hrtimers: @@ -737,10 +1114,14 @@ void tick_setup_sched_timer(void) /* Get the next period (per cpu) */ hrtimer_set_expires(&ts->sched_timer, tick_init_jiffy_update()); - offset = ktime_to_ns(tick_period) >> 1; - do_div(offset, num_possible_cpus()); - offset *= smp_processor_id(); - hrtimer_add_expires_ns(&ts->sched_timer, offset); + + /* Offset the tick to avert jiffies_lock contention. */ + if (sched_skew_tick) { + u64 offset = ktime_to_ns(tick_period) >> 1; + do_div(offset, num_possible_cpus()); + offset *= smp_processor_id(); + hrtimer_add_expires_ns(&ts->sched_timer, offset); + } for (;;) { hrtimer_forward(&ts->sched_timer, now, tick_period); @@ -752,14 +1133,16 @@ void tick_setup_sched_timer(void) now = ktime_get(); } -#ifdef CONFIG_NO_HZ - if (tick_nohz_enabled) +#ifdef CONFIG_NO_HZ_COMMON + if (tick_nohz_enabled) { ts->nohz_mode = NOHZ_MODE_HIGHRES; + tick_nohz_active = 1; + } #endif } #endif /* HIGH_RES_TIMERS */ -#if defined CONFIG_NO_HZ || defined CONFIG_HIGH_RES_TIMERS +#if defined CONFIG_NO_HZ_COMMON || defined CONFIG_HIGH_RES_TIMERS void tick_cancel_sched_timer(int cpu) { struct tick_sched *ts = &per_cpu(tick_cpu_sched, cpu); @@ -769,7 +1152,7 @@ void tick_cancel_sched_timer(int cpu) hrtimer_cancel(&ts->sched_timer); # endif - ts->nohz_mode = NOHZ_MODE_INACTIVE; + memset(ts, 0, sizeof(*ts)); } #endif diff --git a/kernel/time/timecompare.c b/kernel/time/timecompare.c deleted file mode 100644 index 12f5c55090b..00000000000 --- a/kernel/time/timecompare.c +++ /dev/null @@ -1,191 +0,0 @@ -/* - * Copyright (C) 2009 Intel Corporation. - * Author: Patrick Ohly <patrick.ohly@intel.com> - * - * This program is free software; you can redistribute it and/or modify - * it under the terms of the GNU General Public License as published by - * the Free Software Foundation; either version 2 of the License, or - * (at your option) any later version. - * - * This program is distributed in the hope that it will be useful, - * but WITHOUT ANY WARRANTY; without even the implied warranty of - * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the - * GNU General Public License for more details. - * - * You should have received a copy of the GNU General Public License - * along with this program; if not, write to the Free Software - * Foundation, Inc., 675 Mass Ave, Cambridge, MA 02139, USA. - */ - -#include <linux/timecompare.h> -#include <linux/module.h> -#include <linux/math64.h> - -/* - * fixed point arithmetic scale factor for skew - * - * Usually one would measure skew in ppb (parts per billion, 1e9), but - * using a factor of 2 simplifies the math. - */ -#define TIMECOMPARE_SKEW_RESOLUTION (((s64)1)<<30) - -ktime_t timecompare_transform(struct timecompare *sync, - u64 source_tstamp) -{ - u64 nsec; - - nsec = source_tstamp + sync->offset; - nsec += (s64)(source_tstamp - sync->last_update) * sync->skew / - TIMECOMPARE_SKEW_RESOLUTION; - - return ns_to_ktime(nsec); -} -EXPORT_SYMBOL_GPL(timecompare_transform); - -int timecompare_offset(struct timecompare *sync, - s64 *offset, - u64 *source_tstamp) -{ - u64 start_source = 0, end_source = 0; - struct { - s64 offset; - s64 duration_target; - } buffer[10], sample, *samples; - int counter = 0, i; - int used; - int index; - int num_samples = sync->num_samples; - - if (num_samples > sizeof(buffer)/sizeof(buffer[0])) { - samples = kmalloc(sizeof(*samples) * num_samples, GFP_ATOMIC); - if (!samples) { - samples = buffer; - num_samples = sizeof(buffer)/sizeof(buffer[0]); - } - } else { - samples = buffer; - } - - /* run until we have enough valid samples, but do not try forever */ - i = 0; - counter = 0; - while (1) { - u64 ts; - ktime_t start, end; - - start = sync->target(); - ts = timecounter_read(sync->source); - end = sync->target(); - - if (!i) - start_source = ts; - - /* ignore negative durations */ - sample.duration_target = ktime_to_ns(ktime_sub(end, start)); - if (sample.duration_target >= 0) { - /* - * assume symetric delay to and from source: - * average target time corresponds to measured - * source time - */ - sample.offset = - (ktime_to_ns(end) + ktime_to_ns(start)) / 2 - - ts; - - /* simple insertion sort based on duration */ - index = counter - 1; - while (index >= 0) { - if (samples[index].duration_target < - sample.duration_target) - break; - samples[index + 1] = samples[index]; - index--; - } - samples[index + 1] = sample; - counter++; - } - - i++; - if (counter >= num_samples || i >= 100000) { - end_source = ts; - break; - } - } - - *source_tstamp = (end_source + start_source) / 2; - - /* remove outliers by only using 75% of the samples */ - used = counter * 3 / 4; - if (!used) - used = counter; - if (used) { - /* calculate average */ - s64 off = 0; - for (index = 0; index < used; index++) - off += samples[index].offset; - *offset = div_s64(off, used); - } - - if (samples && samples != buffer) - kfree(samples); - - return used; -} -EXPORT_SYMBOL_GPL(timecompare_offset); - -void __timecompare_update(struct timecompare *sync, - u64 source_tstamp) -{ - s64 offset; - u64 average_time; - - if (!timecompare_offset(sync, &offset, &average_time)) - return; - - if (!sync->last_update) { - sync->last_update = average_time; - sync->offset = offset; - sync->skew = 0; - } else { - s64 delta_nsec = average_time - sync->last_update; - - /* avoid division by negative or small deltas */ - if (delta_nsec >= 10000) { - s64 delta_offset_nsec = offset - sync->offset; - s64 skew; /* delta_offset_nsec * - TIMECOMPARE_SKEW_RESOLUTION / - delta_nsec */ - u64 divisor; - - /* div_s64() is limited to 32 bit divisor */ - skew = delta_offset_nsec * TIMECOMPARE_SKEW_RESOLUTION; - divisor = delta_nsec; - while (unlikely(divisor >= ((s64)1) << 32)) { - /* divide both by 2; beware, right shift - of negative value has undefined - behavior and can only be used for - the positive divisor */ - skew = div_s64(skew, 2); - divisor >>= 1; - } - skew = div_s64(skew, divisor); - - /* - * Calculate new overall skew as 4/16 the - * old value and 12/16 the new one. This is - * a rather arbitrary tradeoff between - * only using the latest measurement (0/16 and - * 16/16) and even more weight on past measurements. - */ -#define TIMECOMPARE_NEW_SKEW_PER_16 12 - sync->skew = - div_s64((16 - TIMECOMPARE_NEW_SKEW_PER_16) * - sync->skew + - TIMECOMPARE_NEW_SKEW_PER_16 * skew, - 16); - sync->last_update = average_time; - sync->offset = offset; - } - } -} -EXPORT_SYMBOL_GPL(__timecompare_update); diff --git a/kernel/time/timekeeping.c b/kernel/time/timekeeping.c index 7faaa32fbf4..32d8d6aaedb 100644 --- a/kernel/time/timekeeping.c +++ b/kernel/time/timekeeping.c @@ -8,50 +8,92 @@ * */ +#include <linux/timekeeper_internal.h> #include <linux/module.h> #include <linux/interrupt.h> #include <linux/percpu.h> #include <linux/init.h> #include <linux/mm.h> #include <linux/sched.h> -#include <linux/sysdev.h> +#include <linux/syscore_ops.h> #include <linux/clocksource.h> #include <linux/jiffies.h> #include <linux/time.h> #include <linux/tick.h> #include <linux/stop_machine.h> +#include <linux/pvclock_gtod.h> +#include <linux/compiler.h> -/* Structure holding internal timekeeping values. */ -struct timekeeper { - /* Current clocksource used for timekeeping. */ - struct clocksource *clock; - /* The shift value of the current clocksource. */ - int shift; - - /* Number of clock cycles in one NTP interval. */ - cycle_t cycle_interval; - /* Number of clock shifted nano seconds in one NTP interval. */ - u64 xtime_interval; - /* Raw nano seconds accumulated per NTP interval. */ - u32 raw_interval; - - /* Clock shifted nano seconds remainder not stored in xtime.tv_nsec. */ - u64 xtime_nsec; - /* Difference between accumulated time and NTP time in ntp - * shifted nano seconds. */ - s64 ntp_error; - /* Shift conversion between clock shifted nano seconds and - * ntp shifted nano seconds. */ - int ntp_error_shift; - /* NTP adjusted clock multiplier */ - u32 mult; -}; +#include "tick-internal.h" +#include "ntp_internal.h" +#include "timekeeping_internal.h" + +#define TK_CLEAR_NTP (1 << 0) +#define TK_MIRROR (1 << 1) +#define TK_CLOCK_WAS_SET (1 << 2) + +static struct timekeeper timekeeper; +static DEFINE_RAW_SPINLOCK(timekeeper_lock); +static seqcount_t timekeeper_seq; +static struct timekeeper shadow_timekeeper; -struct timekeeper timekeeper; +/* flag for if timekeeping is suspended */ +int __read_mostly timekeeping_suspended; + +/* Flag for if there is a persistent clock on this platform */ +bool __read_mostly persistent_clock_exist = false; + +static inline void tk_normalize_xtime(struct timekeeper *tk) +{ + while (tk->xtime_nsec >= ((u64)NSEC_PER_SEC << tk->shift)) { + tk->xtime_nsec -= (u64)NSEC_PER_SEC << tk->shift; + tk->xtime_sec++; + } +} + +static void tk_set_xtime(struct timekeeper *tk, const struct timespec *ts) +{ + tk->xtime_sec = ts->tv_sec; + tk->xtime_nsec = (u64)ts->tv_nsec << tk->shift; +} + +static void tk_xtime_add(struct timekeeper *tk, const struct timespec *ts) +{ + tk->xtime_sec += ts->tv_sec; + tk->xtime_nsec += (u64)ts->tv_nsec << tk->shift; + tk_normalize_xtime(tk); +} + +static void tk_set_wall_to_mono(struct timekeeper *tk, struct timespec wtm) +{ + struct timespec tmp; + + /* + * Verify consistency of: offset_real = -wall_to_monotonic + * before modifying anything + */ + set_normalized_timespec(&tmp, -tk->wall_to_monotonic.tv_sec, + -tk->wall_to_monotonic.tv_nsec); + WARN_ON_ONCE(tk->offs_real.tv64 != timespec_to_ktime(tmp).tv64); + tk->wall_to_monotonic = wtm; + set_normalized_timespec(&tmp, -wtm.tv_sec, -wtm.tv_nsec); + tk->offs_real = timespec_to_ktime(tmp); + tk->offs_tai = ktime_add(tk->offs_real, ktime_set(tk->tai_offset, 0)); +} + +static void tk_set_sleep_time(struct timekeeper *tk, struct timespec t) +{ + /* Verify consistency before modifying */ + WARN_ON_ONCE(tk->offs_boot.tv64 != timespec_to_ktime(tk->total_sleep_time).tv64); + + tk->total_sleep_time = t; + tk->offs_boot = timespec_to_ktime(t); +} /** - * timekeeper_setup_internals - Set up internals to use clocksource clock. + * tk_setup_internals - Set up internals to use clocksource clock. * + * @tk: The target timekeeper to setup. * @clock: Pointer to clocksource. * * Calculates a fixed cycle/nsec interval for a given clocksource/adjustment @@ -59,128 +101,165 @@ struct timekeeper timekeeper; * * Unless you're the timekeeping code, you should not be using this! */ -static void timekeeper_setup_internals(struct clocksource *clock) +static void tk_setup_internals(struct timekeeper *tk, struct clocksource *clock) { cycle_t interval; - u64 tmp; + u64 tmp, ntpinterval; + struct clocksource *old_clock; - timekeeper.clock = clock; - clock->cycle_last = clock->read(clock); + old_clock = tk->clock; + tk->clock = clock; + tk->cycle_last = clock->cycle_last = clock->read(clock); /* Do the ns -> cycle conversion first, using original mult */ tmp = NTP_INTERVAL_LENGTH; tmp <<= clock->shift; + ntpinterval = tmp; tmp += clock->mult/2; do_div(tmp, clock->mult); if (tmp == 0) tmp = 1; interval = (cycle_t) tmp; - timekeeper.cycle_interval = interval; + tk->cycle_interval = interval; /* Go back from cycles -> shifted ns */ - timekeeper.xtime_interval = (u64) interval * clock->mult; - timekeeper.raw_interval = + tk->xtime_interval = (u64) interval * clock->mult; + tk->xtime_remainder = ntpinterval - tk->xtime_interval; + tk->raw_interval = ((u64) interval * clock->mult) >> clock->shift; - timekeeper.xtime_nsec = 0; - timekeeper.shift = clock->shift; + /* if changing clocks, convert xtime_nsec shift units */ + if (old_clock) { + int shift_change = clock->shift - old_clock->shift; + if (shift_change < 0) + tk->xtime_nsec >>= -shift_change; + else + tk->xtime_nsec <<= shift_change; + } + tk->shift = clock->shift; - timekeeper.ntp_error = 0; - timekeeper.ntp_error_shift = NTP_SCALE_SHIFT - clock->shift; + tk->ntp_error = 0; + tk->ntp_error_shift = NTP_SCALE_SHIFT - clock->shift; /* * The timekeeper keeps its own mult values for the currently * active clocksource. These value will be adjusted via NTP * to counteract clock drifting. */ - timekeeper.mult = clock->mult; + tk->mult = clock->mult; } /* Timekeeper helper functions. */ -static inline s64 timekeeping_get_ns(void) + +#ifdef CONFIG_ARCH_USES_GETTIMEOFFSET +u32 (*arch_gettimeoffset)(void); + +u32 get_arch_timeoffset(void) +{ + if (likely(arch_gettimeoffset)) + return arch_gettimeoffset(); + return 0; +} +#else +static inline u32 get_arch_timeoffset(void) { return 0; } +#endif + +static inline s64 timekeeping_get_ns(struct timekeeper *tk) { cycle_t cycle_now, cycle_delta; struct clocksource *clock; + s64 nsec; /* read clocksource: */ - clock = timekeeper.clock; + clock = tk->clock; cycle_now = clock->read(clock); /* calculate the delta since the last update_wall_time: */ cycle_delta = (cycle_now - clock->cycle_last) & clock->mask; - /* return delta convert to nanoseconds using ntp adjusted mult. */ - return clocksource_cyc2ns(cycle_delta, timekeeper.mult, - timekeeper.shift); + nsec = cycle_delta * tk->mult + tk->xtime_nsec; + nsec >>= tk->shift; + + /* If arch requires, add in get_arch_timeoffset() */ + return nsec + get_arch_timeoffset(); } -static inline s64 timekeeping_get_ns_raw(void) +static inline s64 timekeeping_get_ns_raw(struct timekeeper *tk) { cycle_t cycle_now, cycle_delta; struct clocksource *clock; + s64 nsec; /* read clocksource: */ - clock = timekeeper.clock; + clock = tk->clock; cycle_now = clock->read(clock); /* calculate the delta since the last update_wall_time: */ cycle_delta = (cycle_now - clock->cycle_last) & clock->mask; - /* return delta convert to nanoseconds using ntp adjusted mult. */ - return clocksource_cyc2ns(cycle_delta, clock->mult, clock->shift); + /* convert delta to nanoseconds. */ + nsec = clocksource_cyc2ns(cycle_delta, clock->mult, clock->shift); + + /* If arch requires, add in get_arch_timeoffset() */ + return nsec + get_arch_timeoffset(); } -/* - * This read-write spinlock protects us from races in SMP while - * playing with xtime. - */ -__cacheline_aligned_in_smp DEFINE_SEQLOCK(xtime_lock); +static RAW_NOTIFIER_HEAD(pvclock_gtod_chain); +static void update_pvclock_gtod(struct timekeeper *tk, bool was_set) +{ + raw_notifier_call_chain(&pvclock_gtod_chain, was_set, tk); +} -/* - * The current time - * wall_to_monotonic is what we need to add to xtime (or xtime corrected - * for sub jiffie times) to get to monotonic time. Monotonic is pegged - * at zero at system boot time, so wall_to_monotonic will be negative, - * however, we will ALWAYS keep the tv_nsec part positive so we can use - * the usual normalization. - * - * wall_to_monotonic is moved after resume from suspend for the monotonic - * time not to jump. We need to add total_sleep_time to wall_to_monotonic - * to get the real boot based time offset. - * - * - wall_to_monotonic is no longer the boot time, getboottime must be - * used instead. +/** + * pvclock_gtod_register_notifier - register a pvclock timedata update listener */ -struct timespec xtime __attribute__ ((aligned (16))); -struct timespec wall_to_monotonic __attribute__ ((aligned (16))); -static struct timespec total_sleep_time; +int pvclock_gtod_register_notifier(struct notifier_block *nb) +{ + struct timekeeper *tk = &timekeeper; + unsigned long flags; + int ret; -/* - * The raw monotonic time for the CLOCK_MONOTONIC_RAW posix clock. - */ -struct timespec raw_time; + raw_spin_lock_irqsave(&timekeeper_lock, flags); + ret = raw_notifier_chain_register(&pvclock_gtod_chain, nb); + update_pvclock_gtod(tk, true); + raw_spin_unlock_irqrestore(&timekeeper_lock, flags); -/* flag for if timekeeping is suspended */ -int __read_mostly timekeeping_suspended; + return ret; +} +EXPORT_SYMBOL_GPL(pvclock_gtod_register_notifier); -static struct timespec xtime_cache __attribute__ ((aligned (16))); -void update_xtime_cache(u64 nsec) +/** + * pvclock_gtod_unregister_notifier - unregister a pvclock + * timedata update listener + */ +int pvclock_gtod_unregister_notifier(struct notifier_block *nb) { - xtime_cache = xtime; - timespec_add_ns(&xtime_cache, nsec); + unsigned long flags; + int ret; + + raw_spin_lock_irqsave(&timekeeper_lock, flags); + ret = raw_notifier_chain_unregister(&pvclock_gtod_chain, nb); + raw_spin_unlock_irqrestore(&timekeeper_lock, flags); + + return ret; } +EXPORT_SYMBOL_GPL(pvclock_gtod_unregister_notifier); -/* must hold xtime_lock */ -void timekeeping_leap_insert(int leapsecond) +/* must hold timekeeper_lock */ +static void timekeeping_update(struct timekeeper *tk, unsigned int action) { - xtime.tv_sec += leapsecond; - wall_to_monotonic.tv_sec -= leapsecond; - update_vsyscall(&xtime, timekeeper.clock, timekeeper.mult); -} + if (action & TK_CLEAR_NTP) { + tk->ntp_error = 0; + ntp_clear(); + } + update_vsyscall(tk); + update_pvclock_gtod(tk, action & TK_CLOCK_WAS_SET); -#ifdef CONFIG_GENERIC_TIME + if (action & TK_MIRROR) + memcpy(&shadow_timekeeper, &timekeeper, sizeof(timekeeper)); +} /** * timekeeping_forward_now - update clock to the current time @@ -189,72 +268,88 @@ void timekeeping_leap_insert(int leapsecond) * update_wall_time(). This is useful before significant clock changes, * as it avoids having to deal with this time offset explicitly. */ -static void timekeeping_forward_now(void) +static void timekeeping_forward_now(struct timekeeper *tk) { cycle_t cycle_now, cycle_delta; struct clocksource *clock; s64 nsec; - clock = timekeeper.clock; + clock = tk->clock; cycle_now = clock->read(clock); cycle_delta = (cycle_now - clock->cycle_last) & clock->mask; - clock->cycle_last = cycle_now; + tk->cycle_last = clock->cycle_last = cycle_now; - nsec = clocksource_cyc2ns(cycle_delta, timekeeper.mult, - timekeeper.shift); + tk->xtime_nsec += cycle_delta * tk->mult; - /* If arch requires, add in gettimeoffset() */ - nsec += arch_gettimeoffset(); + /* If arch requires, add in get_arch_timeoffset() */ + tk->xtime_nsec += (u64)get_arch_timeoffset() << tk->shift; - timespec_add_ns(&xtime, nsec); + tk_normalize_xtime(tk); nsec = clocksource_cyc2ns(cycle_delta, clock->mult, clock->shift); - timespec_add_ns(&raw_time, nsec); + timespec_add_ns(&tk->raw_time, nsec); } /** - * getnstimeofday - Returns the time of day in a timespec + * __getnstimeofday - Returns the time of day in a timespec. * @ts: pointer to the timespec to be set * - * Returns the time of day in a timespec. + * Updates the time of day in the timespec. + * Returns 0 on success, or -ve when suspended (timespec will be undefined). */ -void getnstimeofday(struct timespec *ts) +int __getnstimeofday(struct timespec *ts) { + struct timekeeper *tk = &timekeeper; unsigned long seq; - s64 nsecs; - - WARN_ON(timekeeping_suspended); + s64 nsecs = 0; do { - seq = read_seqbegin(&xtime_lock); + seq = read_seqcount_begin(&timekeeper_seq); - *ts = xtime; - nsecs = timekeeping_get_ns(); + ts->tv_sec = tk->xtime_sec; + nsecs = timekeeping_get_ns(tk); - /* If arch requires, add in gettimeoffset() */ - nsecs += arch_gettimeoffset(); - - } while (read_seqretry(&xtime_lock, seq)); + } while (read_seqcount_retry(&timekeeper_seq, seq)); + ts->tv_nsec = 0; timespec_add_ns(ts, nsecs); + + /* + * Do not bail out early, in case there were callers still using + * the value, even in the face of the WARN_ON. + */ + if (unlikely(timekeeping_suspended)) + return -EAGAIN; + return 0; } +EXPORT_SYMBOL(__getnstimeofday); +/** + * getnstimeofday - Returns the time of day in a timespec. + * @ts: pointer to the timespec to be set + * + * Returns the time of day in a timespec (WARN if suspended). + */ +void getnstimeofday(struct timespec *ts) +{ + WARN_ON(__getnstimeofday(ts)); +} EXPORT_SYMBOL(getnstimeofday); ktime_t ktime_get(void) { + struct timekeeper *tk = &timekeeper; unsigned int seq; s64 secs, nsecs; WARN_ON(timekeeping_suspended); do { - seq = read_seqbegin(&xtime_lock); - secs = xtime.tv_sec + wall_to_monotonic.tv_sec; - nsecs = xtime.tv_nsec + wall_to_monotonic.tv_nsec; - nsecs += timekeeping_get_ns(); + seq = read_seqcount_begin(&timekeeper_seq); + secs = tk->xtime_sec + tk->wall_to_monotonic.tv_sec; + nsecs = timekeeping_get_ns(tk) + tk->wall_to_monotonic.tv_nsec; - } while (read_seqretry(&xtime_lock, seq)); + } while (read_seqcount_retry(&timekeeper_seq, seq)); /* * Use ktime_set/ktime_add_ns to create a proper ktime on * 32-bit architectures without CONFIG_KTIME_SCALAR. @@ -273,25 +368,109 @@ EXPORT_SYMBOL_GPL(ktime_get); */ void ktime_get_ts(struct timespec *ts) { + struct timekeeper *tk = &timekeeper; struct timespec tomono; + s64 nsec; unsigned int seq; - s64 nsecs; WARN_ON(timekeeping_suspended); do { - seq = read_seqbegin(&xtime_lock); - *ts = xtime; - tomono = wall_to_monotonic; - nsecs = timekeeping_get_ns(); + seq = read_seqcount_begin(&timekeeper_seq); + ts->tv_sec = tk->xtime_sec; + nsec = timekeeping_get_ns(tk); + tomono = tk->wall_to_monotonic; - } while (read_seqretry(&xtime_lock, seq)); + } while (read_seqcount_retry(&timekeeper_seq, seq)); - set_normalized_timespec(ts, ts->tv_sec + tomono.tv_sec, - ts->tv_nsec + tomono.tv_nsec + nsecs); + ts->tv_sec += tomono.tv_sec; + ts->tv_nsec = 0; + timespec_add_ns(ts, nsec + tomono.tv_nsec); } EXPORT_SYMBOL_GPL(ktime_get_ts); + +/** + * timekeeping_clocktai - Returns the TAI time of day in a timespec + * @ts: pointer to the timespec to be set + * + * Returns the time of day in a timespec. + */ +void timekeeping_clocktai(struct timespec *ts) +{ + struct timekeeper *tk = &timekeeper; + unsigned long seq; + u64 nsecs; + + WARN_ON(timekeeping_suspended); + + do { + seq = read_seqcount_begin(&timekeeper_seq); + + ts->tv_sec = tk->xtime_sec + tk->tai_offset; + nsecs = timekeeping_get_ns(tk); + + } while (read_seqcount_retry(&timekeeper_seq, seq)); + + ts->tv_nsec = 0; + timespec_add_ns(ts, nsecs); + +} +EXPORT_SYMBOL(timekeeping_clocktai); + + +/** + * ktime_get_clocktai - Returns the TAI time of day in a ktime + * + * Returns the time of day in a ktime. + */ +ktime_t ktime_get_clocktai(void) +{ + struct timespec ts; + + timekeeping_clocktai(&ts); + return timespec_to_ktime(ts); +} +EXPORT_SYMBOL(ktime_get_clocktai); + +#ifdef CONFIG_NTP_PPS + +/** + * getnstime_raw_and_real - get day and raw monotonic time in timespec format + * @ts_raw: pointer to the timespec to be set to raw monotonic time + * @ts_real: pointer to the timespec to be set to the time of day + * + * This function reads both the time of day and raw monotonic time at the + * same time atomically and stores the resulting timestamps in timespec + * format. + */ +void getnstime_raw_and_real(struct timespec *ts_raw, struct timespec *ts_real) +{ + struct timekeeper *tk = &timekeeper; + unsigned long seq; + s64 nsecs_raw, nsecs_real; + + WARN_ON_ONCE(timekeeping_suspended); + + do { + seq = read_seqcount_begin(&timekeeper_seq); + + *ts_raw = tk->raw_time; + ts_real->tv_sec = tk->xtime_sec; + ts_real->tv_nsec = 0; + + nsecs_raw = timekeeping_get_ns_raw(tk); + nsecs_real = timekeeping_get_ns(tk); + + } while (read_seqcount_retry(&timekeeper_seq, seq)); + + timespec_add_ns(ts_raw, nsecs_raw); + timespec_add_ns(ts_real, nsecs_real); +} +EXPORT_SYMBOL(getnstime_raw_and_real); + +#endif /* CONFIG_NTP_PPS */ + /** * do_gettimeofday - Returns the time of day in a timeval * @tv: pointer to the timeval to be set @@ -306,130 +485,196 @@ void do_gettimeofday(struct timeval *tv) tv->tv_sec = now.tv_sec; tv->tv_usec = now.tv_nsec/1000; } - EXPORT_SYMBOL(do_gettimeofday); + /** * do_settimeofday - Sets the time of day * @tv: pointer to the timespec variable containing the new time * * Sets the time of day to the new time and update NTP and notify hrtimers */ -int do_settimeofday(struct timespec *tv) +int do_settimeofday(const struct timespec *tv) { - struct timespec ts_delta; + struct timekeeper *tk = &timekeeper; + struct timespec ts_delta, xt; unsigned long flags; - if ((unsigned long)tv->tv_nsec >= NSEC_PER_SEC) + if (!timespec_valid_strict(tv)) return -EINVAL; - write_seqlock_irqsave(&xtime_lock, flags); - - timekeeping_forward_now(); + raw_spin_lock_irqsave(&timekeeper_lock, flags); + write_seqcount_begin(&timekeeper_seq); - ts_delta.tv_sec = tv->tv_sec - xtime.tv_sec; - ts_delta.tv_nsec = tv->tv_nsec - xtime.tv_nsec; - wall_to_monotonic = timespec_sub(wall_to_monotonic, ts_delta); + timekeeping_forward_now(tk); - xtime = *tv; + xt = tk_xtime(tk); + ts_delta.tv_sec = tv->tv_sec - xt.tv_sec; + ts_delta.tv_nsec = tv->tv_nsec - xt.tv_nsec; - update_xtime_cache(0); + tk_set_wall_to_mono(tk, timespec_sub(tk->wall_to_monotonic, ts_delta)); - timekeeper.ntp_error = 0; - ntp_clear(); + tk_set_xtime(tk, tv); - update_vsyscall(&xtime, timekeeper.clock, timekeeper.mult); + timekeeping_update(tk, TK_CLEAR_NTP | TK_MIRROR | TK_CLOCK_WAS_SET); - write_sequnlock_irqrestore(&xtime_lock, flags); + write_seqcount_end(&timekeeper_seq); + raw_spin_unlock_irqrestore(&timekeeper_lock, flags); /* signal hrtimers about time change */ clock_was_set(); return 0; } - EXPORT_SYMBOL(do_settimeofday); /** - * change_clocksource - Swaps clocksources if a new one is available + * timekeeping_inject_offset - Adds or subtracts from the current time. + * @tv: pointer to the timespec variable containing the offset * - * Accumulates current time interval and initializes new clocksource + * Adds or subtracts an offset value from the current time. */ -static int change_clocksource(void *data) +int timekeeping_inject_offset(struct timespec *ts) { - struct clocksource *new, *old; + struct timekeeper *tk = &timekeeper; + unsigned long flags; + struct timespec tmp; + int ret = 0; - new = (struct clocksource *) data; + if ((unsigned long)ts->tv_nsec >= NSEC_PER_SEC) + return -EINVAL; + + raw_spin_lock_irqsave(&timekeeper_lock, flags); + write_seqcount_begin(&timekeeper_seq); + + timekeeping_forward_now(tk); - timekeeping_forward_now(); - if (!new->enable || new->enable(new) == 0) { - old = timekeeper.clock; - timekeeper_setup_internals(new); - if (old->disable) - old->disable(old); + /* Make sure the proposed value is valid */ + tmp = timespec_add(tk_xtime(tk), *ts); + if (!timespec_valid_strict(&tmp)) { + ret = -EINVAL; + goto error; } - return 0; + + tk_xtime_add(tk, ts); + tk_set_wall_to_mono(tk, timespec_sub(tk->wall_to_monotonic, *ts)); + +error: /* even if we error out, we forwarded the time, so call update */ + timekeeping_update(tk, TK_CLEAR_NTP | TK_MIRROR | TK_CLOCK_WAS_SET); + + write_seqcount_end(&timekeeper_seq); + raw_spin_unlock_irqrestore(&timekeeper_lock, flags); + + /* signal hrtimers about time change */ + clock_was_set(); + + return ret; } +EXPORT_SYMBOL(timekeeping_inject_offset); + /** - * timekeeping_notify - Install a new clock source - * @clock: pointer to the clock source + * timekeeping_get_tai_offset - Returns current TAI offset from UTC * - * This function is called from clocksource.c after a new, better clock - * source has been registered. The caller holds the clocksource_mutex. */ -void timekeeping_notify(struct clocksource *clock) +s32 timekeeping_get_tai_offset(void) { - if (timekeeper.clock == clock) - return; - stop_machine(change_clocksource, clock, NULL); - tick_clock_notify(); -} + struct timekeeper *tk = &timekeeper; + unsigned int seq; + s32 ret; -#else /* GENERIC_TIME */ + do { + seq = read_seqcount_begin(&timekeeper_seq); + ret = tk->tai_offset; + } while (read_seqcount_retry(&timekeeper_seq, seq)); -static inline void timekeeping_forward_now(void) { } + return ret; +} /** - * ktime_get - get the monotonic time in ktime_t format + * __timekeeping_set_tai_offset - Lock free worker function * - * returns the time in ktime_t format */ -ktime_t ktime_get(void) +static void __timekeeping_set_tai_offset(struct timekeeper *tk, s32 tai_offset) { - struct timespec now; + tk->tai_offset = tai_offset; + tk->offs_tai = ktime_add(tk->offs_real, ktime_set(tai_offset, 0)); +} - ktime_get_ts(&now); +/** + * timekeeping_set_tai_offset - Sets the current TAI offset from UTC + * + */ +void timekeeping_set_tai_offset(s32 tai_offset) +{ + struct timekeeper *tk = &timekeeper; + unsigned long flags; - return timespec_to_ktime(now); + raw_spin_lock_irqsave(&timekeeper_lock, flags); + write_seqcount_begin(&timekeeper_seq); + __timekeeping_set_tai_offset(tk, tai_offset); + timekeeping_update(tk, TK_MIRROR | TK_CLOCK_WAS_SET); + write_seqcount_end(&timekeeper_seq); + raw_spin_unlock_irqrestore(&timekeeper_lock, flags); + clock_was_set(); } -EXPORT_SYMBOL_GPL(ktime_get); /** - * ktime_get_ts - get the monotonic clock in timespec format - * @ts: pointer to timespec variable + * change_clocksource - Swaps clocksources if a new one is available * - * The function calculates the monotonic clock from the realtime - * clock and the wall_to_monotonic offset and stores the result - * in normalized timespec format in the variable pointed to by @ts. + * Accumulates current time interval and initializes new clocksource */ -void ktime_get_ts(struct timespec *ts) +static int change_clocksource(void *data) { - struct timespec tomono; - unsigned long seq; + struct timekeeper *tk = &timekeeper; + struct clocksource *new, *old; + unsigned long flags; - do { - seq = read_seqbegin(&xtime_lock); - getnstimeofday(ts); - tomono = wall_to_monotonic; + new = (struct clocksource *) data; - } while (read_seqretry(&xtime_lock, seq)); + raw_spin_lock_irqsave(&timekeeper_lock, flags); + write_seqcount_begin(&timekeeper_seq); - set_normalized_timespec(ts, ts->tv_sec + tomono.tv_sec, - ts->tv_nsec + tomono.tv_nsec); + timekeeping_forward_now(tk); + /* + * If the cs is in module, get a module reference. Succeeds + * for built-in code (owner == NULL) as well. + */ + if (try_module_get(new->owner)) { + if (!new->enable || new->enable(new) == 0) { + old = tk->clock; + tk_setup_internals(tk, new); + if (old->disable) + old->disable(old); + module_put(old->owner); + } else { + module_put(new->owner); + } + } + timekeeping_update(tk, TK_CLEAR_NTP | TK_MIRROR | TK_CLOCK_WAS_SET); + + write_seqcount_end(&timekeeper_seq); + raw_spin_unlock_irqrestore(&timekeeper_lock, flags); + + return 0; } -EXPORT_SYMBOL_GPL(ktime_get_ts); -#endif /* !GENERIC_TIME */ +/** + * timekeeping_notify - Install a new clock source + * @clock: pointer to the clock source + * + * This function is called from clocksource.c after a new, better clock + * source has been registered. The caller holds the clocksource_mutex. + */ +int timekeeping_notify(struct clocksource *clock) +{ + struct timekeeper *tk = &timekeeper; + + if (tk->clock == clock) + return 0; + stop_machine(change_clocksource, clock, NULL); + tick_clock_notify(); + return tk->clock == clock ? 0 : -1; +} /** * ktime_get_real - get the real (wall-) time in ktime_t format @@ -454,48 +699,57 @@ EXPORT_SYMBOL_GPL(ktime_get_real); */ void getrawmonotonic(struct timespec *ts) { + struct timekeeper *tk = &timekeeper; unsigned long seq; s64 nsecs; do { - seq = read_seqbegin(&xtime_lock); - nsecs = timekeeping_get_ns_raw(); - *ts = raw_time; + seq = read_seqcount_begin(&timekeeper_seq); + nsecs = timekeeping_get_ns_raw(tk); + *ts = tk->raw_time; - } while (read_seqretry(&xtime_lock, seq)); + } while (read_seqcount_retry(&timekeeper_seq, seq)); timespec_add_ns(ts, nsecs); } EXPORT_SYMBOL(getrawmonotonic); - /** * timekeeping_valid_for_hres - Check if timekeeping is suitable for hres */ int timekeeping_valid_for_hres(void) { + struct timekeeper *tk = &timekeeper; unsigned long seq; int ret; do { - seq = read_seqbegin(&xtime_lock); + seq = read_seqcount_begin(&timekeeper_seq); - ret = timekeeper.clock->flags & CLOCK_SOURCE_VALID_FOR_HRES; + ret = tk->clock->flags & CLOCK_SOURCE_VALID_FOR_HRES; - } while (read_seqretry(&xtime_lock, seq)); + } while (read_seqcount_retry(&timekeeper_seq, seq)); return ret; } /** * timekeeping_max_deferment - Returns max time the clocksource can be deferred - * - * Caller must observe xtime_lock via read_seqbegin/read_seqretry to - * ensure that the clocksource does not change! */ u64 timekeeping_max_deferment(void) { - return timekeeper.clock->max_idle_ns; + struct timekeeper *tk = &timekeeper; + unsigned long seq; + u64 ret; + + do { + seq = read_seqcount_begin(&timekeeper_seq); + + ret = tk->clock->max_idle_ns; + + } while (read_seqcount_retry(&timekeeper_seq, seq)); + + return ret; } /** @@ -507,7 +761,7 @@ u64 timekeeping_max_deferment(void) * * XXX - Do be sure to remove it once all arches implement it. */ -void __attribute__((weak)) read_persistent_clock(struct timespec *ts) +void __weak read_persistent_clock(struct timespec *ts) { ts->tv_sec = 0; ts->tv_nsec = 0; @@ -522,7 +776,7 @@ void __attribute__((weak)) read_persistent_clock(struct timespec *ts) * * XXX - Do be sure to remove it once all arches implement it. */ -void __attribute__((weak)) read_boot_clock(struct timespec *ts) +void __weak read_boot_clock(struct timespec *ts) { ts->tv_sec = 0; ts->tv_nsec = 0; @@ -533,126 +787,278 @@ void __attribute__((weak)) read_boot_clock(struct timespec *ts) */ void __init timekeeping_init(void) { + struct timekeeper *tk = &timekeeper; struct clocksource *clock; unsigned long flags; - struct timespec now, boot; + struct timespec now, boot, tmp; read_persistent_clock(&now); - read_boot_clock(&boot); - write_seqlock_irqsave(&xtime_lock, flags); + if (!timespec_valid_strict(&now)) { + pr_warn("WARNING: Persistent clock returned invalid value!\n" + " Check your CMOS/BIOS settings.\n"); + now.tv_sec = 0; + now.tv_nsec = 0; + } else if (now.tv_sec || now.tv_nsec) + persistent_clock_exist = true; + read_boot_clock(&boot); + if (!timespec_valid_strict(&boot)) { + pr_warn("WARNING: Boot clock returned invalid value!\n" + " Check your CMOS/BIOS settings.\n"); + boot.tv_sec = 0; + boot.tv_nsec = 0; + } + + raw_spin_lock_irqsave(&timekeeper_lock, flags); + write_seqcount_begin(&timekeeper_seq); ntp_init(); clock = clocksource_default_clock(); if (clock->enable) clock->enable(clock); - timekeeper_setup_internals(clock); - - xtime.tv_sec = now.tv_sec; - xtime.tv_nsec = now.tv_nsec; - raw_time.tv_sec = 0; - raw_time.tv_nsec = 0; - if (boot.tv_sec == 0 && boot.tv_nsec == 0) { - boot.tv_sec = xtime.tv_sec; - boot.tv_nsec = xtime.tv_nsec; - } - set_normalized_timespec(&wall_to_monotonic, - -boot.tv_sec, -boot.tv_nsec); - update_xtime_cache(0); - total_sleep_time.tv_sec = 0; - total_sleep_time.tv_nsec = 0; - write_sequnlock_irqrestore(&xtime_lock, flags); + tk_setup_internals(tk, clock); + + tk_set_xtime(tk, &now); + tk->raw_time.tv_sec = 0; + tk->raw_time.tv_nsec = 0; + if (boot.tv_sec == 0 && boot.tv_nsec == 0) + boot = tk_xtime(tk); + + set_normalized_timespec(&tmp, -boot.tv_sec, -boot.tv_nsec); + tk_set_wall_to_mono(tk, tmp); + + tmp.tv_sec = 0; + tmp.tv_nsec = 0; + tk_set_sleep_time(tk, tmp); + + memcpy(&shadow_timekeeper, &timekeeper, sizeof(timekeeper)); + + write_seqcount_end(&timekeeper_seq); + raw_spin_unlock_irqrestore(&timekeeper_lock, flags); } /* time in seconds when suspend began */ static struct timespec timekeeping_suspend_time; /** + * __timekeeping_inject_sleeptime - Internal function to add sleep interval + * @delta: pointer to a timespec delta value + * + * Takes a timespec offset measuring a suspend interval and properly + * adds the sleep offset to the timekeeping variables. + */ +static void __timekeeping_inject_sleeptime(struct timekeeper *tk, + struct timespec *delta) +{ + if (!timespec_valid_strict(delta)) { + printk_deferred(KERN_WARNING + "__timekeeping_inject_sleeptime: Invalid " + "sleep delta value!\n"); + return; + } + tk_xtime_add(tk, delta); + tk_set_wall_to_mono(tk, timespec_sub(tk->wall_to_monotonic, *delta)); + tk_set_sleep_time(tk, timespec_add(tk->total_sleep_time, *delta)); + tk_debug_account_sleep_time(delta); +} + +/** + * timekeeping_inject_sleeptime - Adds suspend interval to timeekeeping values + * @delta: pointer to a timespec delta value + * + * This hook is for architectures that cannot support read_persistent_clock + * because their RTC/persistent clock is only accessible when irqs are enabled. + * + * This function should only be called by rtc_resume(), and allows + * a suspend offset to be injected into the timekeeping values. + */ +void timekeeping_inject_sleeptime(struct timespec *delta) +{ + struct timekeeper *tk = &timekeeper; + unsigned long flags; + + /* + * Make sure we don't set the clock twice, as timekeeping_resume() + * already did it + */ + if (has_persistent_clock()) + return; + + raw_spin_lock_irqsave(&timekeeper_lock, flags); + write_seqcount_begin(&timekeeper_seq); + + timekeeping_forward_now(tk); + + __timekeeping_inject_sleeptime(tk, delta); + + timekeeping_update(tk, TK_CLEAR_NTP | TK_MIRROR | TK_CLOCK_WAS_SET); + + write_seqcount_end(&timekeeper_seq); + raw_spin_unlock_irqrestore(&timekeeper_lock, flags); + + /* signal hrtimers about time change */ + clock_was_set(); +} + +/** * timekeeping_resume - Resumes the generic timekeeping subsystem. - * @dev: unused * * This is for the generic clocksource timekeeping. * xtime/wall_to_monotonic/jiffies/etc are * still managed by arch specific suspend/resume code. */ -static int timekeeping_resume(struct sys_device *dev) +static void timekeeping_resume(void) { + struct timekeeper *tk = &timekeeper; + struct clocksource *clock = tk->clock; unsigned long flags; - struct timespec ts; + struct timespec ts_new, ts_delta; + cycle_t cycle_now, cycle_delta; + bool suspendtime_found = false; - read_persistent_clock(&ts); + read_persistent_clock(&ts_new); + clockevents_resume(); clocksource_resume(); - write_seqlock_irqsave(&xtime_lock, flags); + raw_spin_lock_irqsave(&timekeeper_lock, flags); + write_seqcount_begin(&timekeeper_seq); - if (timespec_compare(&ts, &timekeeping_suspend_time) > 0) { - ts = timespec_sub(ts, timekeeping_suspend_time); - xtime = timespec_add_safe(xtime, ts); - wall_to_monotonic = timespec_sub(wall_to_monotonic, ts); - total_sleep_time = timespec_add_safe(total_sleep_time, ts); + /* + * After system resumes, we need to calculate the suspended time and + * compensate it for the OS time. There are 3 sources that could be + * used: Nonstop clocksource during suspend, persistent clock and rtc + * device. + * + * One specific platform may have 1 or 2 or all of them, and the + * preference will be: + * suspend-nonstop clocksource -> persistent clock -> rtc + * The less preferred source will only be tried if there is no better + * usable source. The rtc part is handled separately in rtc core code. + */ + cycle_now = clock->read(clock); + if ((clock->flags & CLOCK_SOURCE_SUSPEND_NONSTOP) && + cycle_now > clock->cycle_last) { + u64 num, max = ULLONG_MAX; + u32 mult = clock->mult; + u32 shift = clock->shift; + s64 nsec = 0; + + cycle_delta = (cycle_now - clock->cycle_last) & clock->mask; + + /* + * "cycle_delta * mutl" may cause 64 bits overflow, if the + * suspended time is too long. In that case we need do the + * 64 bits math carefully + */ + do_div(max, mult); + if (cycle_delta > max) { + num = div64_u64(cycle_delta, max); + nsec = (((u64) max * mult) >> shift) * num; + cycle_delta -= num * max; + } + nsec += ((u64) cycle_delta * mult) >> shift; + + ts_delta = ns_to_timespec(nsec); + suspendtime_found = true; + } else if (timespec_compare(&ts_new, &timekeeping_suspend_time) > 0) { + ts_delta = timespec_sub(ts_new, timekeeping_suspend_time); + suspendtime_found = true; } - update_xtime_cache(0); - /* re-base the last cycle value */ - timekeeper.clock->cycle_last = timekeeper.clock->read(timekeeper.clock); - timekeeper.ntp_error = 0; + + if (suspendtime_found) + __timekeeping_inject_sleeptime(tk, &ts_delta); + + /* Re-base the last cycle value */ + tk->cycle_last = clock->cycle_last = cycle_now; + tk->ntp_error = 0; timekeeping_suspended = 0; - write_sequnlock_irqrestore(&xtime_lock, flags); + timekeeping_update(tk, TK_MIRROR | TK_CLOCK_WAS_SET); + write_seqcount_end(&timekeeper_seq); + raw_spin_unlock_irqrestore(&timekeeper_lock, flags); touch_softlockup_watchdog(); clockevents_notify(CLOCK_EVT_NOTIFY_RESUME, NULL); /* Resume hrtimers */ - hres_timers_resume(); - - return 0; + hrtimers_resume(); } -static int timekeeping_suspend(struct sys_device *dev, pm_message_t state) +static int timekeeping_suspend(void) { + struct timekeeper *tk = &timekeeper; unsigned long flags; + struct timespec delta, delta_delta; + static struct timespec old_delta; read_persistent_clock(&timekeeping_suspend_time); - write_seqlock_irqsave(&xtime_lock, flags); - timekeeping_forward_now(); + /* + * On some systems the persistent_clock can not be detected at + * timekeeping_init by its return value, so if we see a valid + * value returned, update the persistent_clock_exists flag. + */ + if (timekeeping_suspend_time.tv_sec || timekeeping_suspend_time.tv_nsec) + persistent_clock_exist = true; + + raw_spin_lock_irqsave(&timekeeper_lock, flags); + write_seqcount_begin(&timekeeper_seq); + timekeeping_forward_now(tk); timekeeping_suspended = 1; - write_sequnlock_irqrestore(&xtime_lock, flags); + + /* + * To avoid drift caused by repeated suspend/resumes, + * which each can add ~1 second drift error, + * try to compensate so the difference in system time + * and persistent_clock time stays close to constant. + */ + delta = timespec_sub(tk_xtime(tk), timekeeping_suspend_time); + delta_delta = timespec_sub(delta, old_delta); + if (abs(delta_delta.tv_sec) >= 2) { + /* + * if delta_delta is too large, assume time correction + * has occured and set old_delta to the current delta. + */ + old_delta = delta; + } else { + /* Otherwise try to adjust old_system to compensate */ + timekeeping_suspend_time = + timespec_add(timekeeping_suspend_time, delta_delta); + } + + timekeeping_update(tk, TK_MIRROR); + write_seqcount_end(&timekeeper_seq); + raw_spin_unlock_irqrestore(&timekeeper_lock, flags); clockevents_notify(CLOCK_EVT_NOTIFY_SUSPEND, NULL); + clocksource_suspend(); + clockevents_suspend(); return 0; } /* sysfs resume/suspend bits for timekeeping */ -static struct sysdev_class timekeeping_sysclass = { - .name = "timekeeping", +static struct syscore_ops timekeeping_syscore_ops = { .resume = timekeeping_resume, .suspend = timekeeping_suspend, }; -static struct sys_device device_timer = { - .id = 0, - .cls = &timekeeping_sysclass, -}; - -static int __init timekeeping_init_device(void) +static int __init timekeeping_init_ops(void) { - int error = sysdev_class_register(&timekeeping_sysclass); - if (!error) - error = sysdev_register(&device_timer); - return error; + register_syscore_ops(&timekeeping_syscore_ops); + return 0; } -device_initcall(timekeeping_init_device); +device_initcall(timekeeping_init_ops); /* * If the error is already larger, we look ahead even further * to compensate for late or lost adjustments. */ -static __always_inline int timekeeping_bigadjust(s64 error, s64 *interval, +static __always_inline int timekeeping_bigadjust(struct timekeeper *tk, + s64 error, s64 *interval, s64 *offset) { s64 tick_error, i; @@ -668,7 +1074,7 @@ static __always_inline int timekeeping_bigadjust(s64 error, s64 *interval, * here. This is tuned so that an error of about 1 msec is adjusted * within about 1 sec (or 2^20 nsec in 2^SHIFT_HZ ticks). */ - error2 = timekeeper.ntp_error >> (NTP_SCALE_SHIFT + 22 - 2 * SHIFT_HZ); + error2 = tk->ntp_error >> (NTP_SCALE_SHIFT + 22 - 2 * SHIFT_HZ); error2 = abs(error2); for (look_ahead = 0; error2 > 0; look_ahead++) error2 >>= 2; @@ -677,8 +1083,8 @@ static __always_inline int timekeeping_bigadjust(s64 error, s64 *interval, * Now calculate the error in (1 << look_ahead) ticks, but first * remove the single look ahead already included in the error. */ - tick_error = tick_length >> (timekeeper.ntp_error_shift + 1); - tick_error -= timekeeper.xtime_interval >> 1; + tick_error = ntp_tick_length() >> (tk->ntp_error_shift + 1); + tick_error -= tk->xtime_interval >> 1; error = ((error - tick_error) >> look_ahead) + tick_error; /* Finally calculate the adjustment shift value. */ @@ -703,36 +1109,175 @@ static __always_inline int timekeeping_bigadjust(s64 error, s64 *interval, * this is optimized for the most common adjustments of -1,0,1, * for other values we can do a bit more work. */ -static void timekeeping_adjust(s64 offset) +static void timekeeping_adjust(struct timekeeper *tk, s64 offset) { - s64 error, interval = timekeeper.cycle_interval; + s64 error, interval = tk->cycle_interval; int adj; - error = timekeeper.ntp_error >> (timekeeper.ntp_error_shift - 1); + /* + * The point of this is to check if the error is greater than half + * an interval. + * + * First we shift it down from NTP_SHIFT to clocksource->shifted nsecs. + * + * Note we subtract one in the shift, so that error is really error*2. + * This "saves" dividing(shifting) interval twice, but keeps the + * (error > interval) comparison as still measuring if error is + * larger than half an interval. + * + * Note: It does not "save" on aggravation when reading the code. + */ + error = tk->ntp_error >> (tk->ntp_error_shift - 1); if (error > interval) { + /* + * We now divide error by 4(via shift), which checks if + * the error is greater than twice the interval. + * If it is greater, we need a bigadjust, if its smaller, + * we can adjust by 1. + */ error >>= 2; if (likely(error <= interval)) adj = 1; else - adj = timekeeping_bigadjust(error, &interval, &offset); - } else if (error < -interval) { - error >>= 2; - if (likely(error >= -interval)) { - adj = -1; - interval = -interval; - offset = -offset; - } else - adj = timekeeping_bigadjust(error, &interval, &offset); - } else - return; + adj = timekeeping_bigadjust(tk, error, &interval, &offset); + } else { + if (error < -interval) { + /* See comment above, this is just switched for the negative */ + error >>= 2; + if (likely(error >= -interval)) { + adj = -1; + interval = -interval; + offset = -offset; + } else { + adj = timekeeping_bigadjust(tk, error, &interval, &offset); + } + } else { + goto out_adjust; + } + } + + if (unlikely(tk->clock->maxadj && + (tk->mult + adj > tk->clock->mult + tk->clock->maxadj))) { + printk_deferred_once(KERN_WARNING + "Adjusting %s more than 11%% (%ld vs %ld)\n", + tk->clock->name, (long)tk->mult + adj, + (long)tk->clock->mult + tk->clock->maxadj); + } + /* + * So the following can be confusing. + * + * To keep things simple, lets assume adj == 1 for now. + * + * When adj != 1, remember that the interval and offset values + * have been appropriately scaled so the math is the same. + * + * The basic idea here is that we're increasing the multiplier + * by one, this causes the xtime_interval to be incremented by + * one cycle_interval. This is because: + * xtime_interval = cycle_interval * mult + * So if mult is being incremented by one: + * xtime_interval = cycle_interval * (mult + 1) + * Its the same as: + * xtime_interval = (cycle_interval * mult) + cycle_interval + * Which can be shortened to: + * xtime_interval += cycle_interval + * + * So offset stores the non-accumulated cycles. Thus the current + * time (in shifted nanoseconds) is: + * now = (offset * adj) + xtime_nsec + * Now, even though we're adjusting the clock frequency, we have + * to keep time consistent. In other words, we can't jump back + * in time, and we also want to avoid jumping forward in time. + * + * So given the same offset value, we need the time to be the same + * both before and after the freq adjustment. + * now = (offset * adj_1) + xtime_nsec_1 + * now = (offset * adj_2) + xtime_nsec_2 + * So: + * (offset * adj_1) + xtime_nsec_1 = + * (offset * adj_2) + xtime_nsec_2 + * And we know: + * adj_2 = adj_1 + 1 + * So: + * (offset * adj_1) + xtime_nsec_1 = + * (offset * (adj_1+1)) + xtime_nsec_2 + * (offset * adj_1) + xtime_nsec_1 = + * (offset * adj_1) + offset + xtime_nsec_2 + * Canceling the sides: + * xtime_nsec_1 = offset + xtime_nsec_2 + * Which gives us: + * xtime_nsec_2 = xtime_nsec_1 - offset + * Which simplfies to: + * xtime_nsec -= offset + * + * XXX - TODO: Doc ntp_error calculation. + */ + tk->mult += adj; + tk->xtime_interval += interval; + tk->xtime_nsec -= offset; + tk->ntp_error -= (interval - offset) << tk->ntp_error_shift; + +out_adjust: + /* + * It may be possible that when we entered this function, xtime_nsec + * was very small. Further, if we're slightly speeding the clocksource + * in the code above, its possible the required corrective factor to + * xtime_nsec could cause it to underflow. + * + * Now, since we already accumulated the second, cannot simply roll + * the accumulated second back, since the NTP subsystem has been + * notified via second_overflow. So instead we push xtime_nsec forward + * by the amount we underflowed, and add that amount into the error. + * + * We'll correct this error next time through this function, when + * xtime_nsec is not as small. + */ + if (unlikely((s64)tk->xtime_nsec < 0)) { + s64 neg = -(s64)tk->xtime_nsec; + tk->xtime_nsec = 0; + tk->ntp_error += neg << tk->ntp_error_shift; + } - timekeeper.mult += adj; - timekeeper.xtime_interval += interval; - timekeeper.xtime_nsec -= offset; - timekeeper.ntp_error -= (interval - offset) << - timekeeper.ntp_error_shift; } +/** + * accumulate_nsecs_to_secs - Accumulates nsecs into secs + * + * Helper function that accumulates a the nsecs greater then a second + * from the xtime_nsec field to the xtime_secs field. + * It also calls into the NTP code to handle leapsecond processing. + * + */ +static inline unsigned int accumulate_nsecs_to_secs(struct timekeeper *tk) +{ + u64 nsecps = (u64)NSEC_PER_SEC << tk->shift; + unsigned int clock_set = 0; + + while (tk->xtime_nsec >= nsecps) { + int leap; + + tk->xtime_nsec -= nsecps; + tk->xtime_sec++; + + /* Figure out if its a leap sec and apply if needed */ + leap = second_overflow(tk->xtime_sec); + if (unlikely(leap)) { + struct timespec ts; + + tk->xtime_sec += leap; + + ts.tv_sec = leap; + ts.tv_nsec = 0; + tk_set_wall_to_mono(tk, + timespec_sub(tk->wall_to_monotonic, ts)); + + __timekeeping_set_tai_offset(tk, tk->tai_offset - leap); + + clock_set = TK_CLOCK_WAS_SET; + } + } + return clock_set; +} /** * logarithmic_accumulation - shifted accumulation of cycles @@ -743,128 +1288,164 @@ static void timekeeping_adjust(s64 offset) * * Returns the unconsumed cycles. */ -static cycle_t logarithmic_accumulation(cycle_t offset, int shift) +static cycle_t logarithmic_accumulation(struct timekeeper *tk, cycle_t offset, + u32 shift, + unsigned int *clock_set) { - u64 nsecps = (u64)NSEC_PER_SEC << timekeeper.shift; + cycle_t interval = tk->cycle_interval << shift; + u64 raw_nsecs; /* If the offset is smaller then a shifted interval, do nothing */ - if (offset < timekeeper.cycle_interval<<shift) + if (offset < interval) return offset; /* Accumulate one shifted interval */ - offset -= timekeeper.cycle_interval << shift; - timekeeper.clock->cycle_last += timekeeper.cycle_interval << shift; - - timekeeper.xtime_nsec += timekeeper.xtime_interval << shift; - while (timekeeper.xtime_nsec >= nsecps) { - timekeeper.xtime_nsec -= nsecps; - xtime.tv_sec++; - second_overflow(); - } - - /* Accumulate into raw time */ - raw_time.tv_nsec += timekeeper.raw_interval << shift;; - while (raw_time.tv_nsec >= NSEC_PER_SEC) { - raw_time.tv_nsec -= NSEC_PER_SEC; - raw_time.tv_sec++; + offset -= interval; + tk->cycle_last += interval; + + tk->xtime_nsec += tk->xtime_interval << shift; + *clock_set |= accumulate_nsecs_to_secs(tk); + + /* Accumulate raw time */ + raw_nsecs = (u64)tk->raw_interval << shift; + raw_nsecs += tk->raw_time.tv_nsec; + if (raw_nsecs >= NSEC_PER_SEC) { + u64 raw_secs = raw_nsecs; + raw_nsecs = do_div(raw_secs, NSEC_PER_SEC); + tk->raw_time.tv_sec += raw_secs; } + tk->raw_time.tv_nsec = raw_nsecs; /* Accumulate error between NTP and clock interval */ - timekeeper.ntp_error += tick_length << shift; - timekeeper.ntp_error -= timekeeper.xtime_interval << - (timekeeper.ntp_error_shift + shift); + tk->ntp_error += ntp_tick_length() << shift; + tk->ntp_error -= (tk->xtime_interval + tk->xtime_remainder) << + (tk->ntp_error_shift + shift); return offset; } +#ifdef CONFIG_GENERIC_TIME_VSYSCALL_OLD +static inline void old_vsyscall_fixup(struct timekeeper *tk) +{ + s64 remainder; + + /* + * Store only full nanoseconds into xtime_nsec after rounding + * it up and add the remainder to the error difference. + * XXX - This is necessary to avoid small 1ns inconsistnecies caused + * by truncating the remainder in vsyscalls. However, it causes + * additional work to be done in timekeeping_adjust(). Once + * the vsyscall implementations are converted to use xtime_nsec + * (shifted nanoseconds), and CONFIG_GENERIC_TIME_VSYSCALL_OLD + * users are removed, this can be killed. + */ + remainder = tk->xtime_nsec & ((1ULL << tk->shift) - 1); + tk->xtime_nsec -= remainder; + tk->xtime_nsec += 1ULL << tk->shift; + tk->ntp_error += remainder << tk->ntp_error_shift; + tk->ntp_error -= (1ULL << tk->shift) << tk->ntp_error_shift; +} +#else +#define old_vsyscall_fixup(tk) +#endif + + /** * update_wall_time - Uses the current clocksource to increment the wall time * - * Called from the timer interrupt, must hold a write on xtime_lock. */ void update_wall_time(void) { struct clocksource *clock; + struct timekeeper *real_tk = &timekeeper; + struct timekeeper *tk = &shadow_timekeeper; cycle_t offset; - u64 nsecs; int shift = 0, maxshift; + unsigned int clock_set = 0; + unsigned long flags; + + raw_spin_lock_irqsave(&timekeeper_lock, flags); /* Make sure we're fully resumed: */ if (unlikely(timekeeping_suspended)) - return; + goto out; - clock = timekeeper.clock; -#ifdef CONFIG_GENERIC_TIME - offset = (clock->read(clock) - clock->cycle_last) & clock->mask; + clock = real_tk->clock; + +#ifdef CONFIG_ARCH_USES_GETTIMEOFFSET + offset = real_tk->cycle_interval; #else - offset = timekeeper.cycle_interval; + offset = (clock->read(clock) - clock->cycle_last) & clock->mask; #endif - timekeeper.xtime_nsec = (s64)xtime.tv_nsec << timekeeper.shift; + + /* Check if there's really nothing to do */ + if (offset < real_tk->cycle_interval) + goto out; /* * With NO_HZ we may have to accumulate many cycle_intervals * (think "ticks") worth of time at once. To do this efficiently, * we calculate the largest doubling multiple of cycle_intervals - * that is smaller then the offset. We then accumulate that + * that is smaller than the offset. We then accumulate that * chunk in one go, and then try to consume the next smaller * doubled multiple. */ - shift = ilog2(offset) - ilog2(timekeeper.cycle_interval); + shift = ilog2(offset) - ilog2(tk->cycle_interval); shift = max(0, shift); - /* Bound shift to one less then what overflows tick_length */ - maxshift = (8*sizeof(tick_length) - (ilog2(tick_length)+1)) - 1; + /* Bound shift to one less than what overflows tick_length */ + maxshift = (64 - (ilog2(ntp_tick_length())+1)) - 1; shift = min(shift, maxshift); - while (offset >= timekeeper.cycle_interval) { - offset = logarithmic_accumulation(offset, shift); - shift--; + while (offset >= tk->cycle_interval) { + offset = logarithmic_accumulation(tk, offset, shift, + &clock_set); + if (offset < tk->cycle_interval<<shift) + shift--; } /* correct the clock when NTP error is too big */ - timekeeping_adjust(offset); + timekeeping_adjust(tk, offset); /* - * Since in the loop above, we accumulate any amount of time - * in xtime_nsec over a second into xtime.tv_sec, its possible for - * xtime_nsec to be fairly small after the loop. Further, if we're - * slightly speeding the clocksource up in timekeeping_adjust(), - * its possible the required corrective factor to xtime_nsec could - * cause it to underflow. - * - * Now, we cannot simply roll the accumulated second back, since - * the NTP subsystem has been notified via second_overflow. So - * instead we push xtime_nsec forward by the amount we underflowed, - * and add that amount into the error. - * - * We'll correct this error next time through this function, when - * xtime_nsec is not as small. + * XXX This can be killed once everyone converts + * to the new update_vsyscall. */ - if (unlikely((s64)timekeeper.xtime_nsec < 0)) { - s64 neg = -(s64)timekeeper.xtime_nsec; - timekeeper.xtime_nsec = 0; - timekeeper.ntp_error += neg << timekeeper.ntp_error_shift; - } + old_vsyscall_fixup(tk); - /* store full nanoseconds into xtime after rounding it up and - * add the remainder to the error difference. + /* + * Finally, make sure that after the rounding + * xtime_nsec isn't larger than NSEC_PER_SEC */ - xtime.tv_nsec = ((s64) timekeeper.xtime_nsec >> timekeeper.shift) + 1; - timekeeper.xtime_nsec -= (s64) xtime.tv_nsec << timekeeper.shift; - timekeeper.ntp_error += timekeeper.xtime_nsec << - timekeeper.ntp_error_shift; + clock_set |= accumulate_nsecs_to_secs(tk); - nsecs = clocksource_cyc2ns(offset, timekeeper.mult, timekeeper.shift); - update_xtime_cache(nsecs); - - /* check to see if there is a new clocksource to use */ - update_vsyscall(&xtime, timekeeper.clock, timekeeper.mult); + write_seqcount_begin(&timekeeper_seq); + /* Update clock->cycle_last with the new value */ + clock->cycle_last = tk->cycle_last; + /* + * Update the real timekeeper. + * + * We could avoid this memcpy by switching pointers, but that + * requires changes to all other timekeeper usage sites as + * well, i.e. move the timekeeper pointer getter into the + * spinlocked/seqcount protected sections. And we trade this + * memcpy under the timekeeper_seq against one before we start + * updating. + */ + memcpy(real_tk, tk, sizeof(*tk)); + timekeeping_update(real_tk, clock_set); + write_seqcount_end(&timekeeper_seq); +out: + raw_spin_unlock_irqrestore(&timekeeper_lock, flags); + if (clock_set) + /* Have to call _delayed version, since in irq context*/ + clock_was_set_delayed(); } /** * getboottime - Return the real time of system boot. * @ts: pointer to the timespec to be set * - * Returns the time of day in a timespec. + * Returns the wall-time of boot in a timespec. * * This is based on the wall_to_monotonic offset and the total suspend * time. Calls to settimeofday will affect the value returned (which @@ -873,13 +1454,67 @@ void update_wall_time(void) */ void getboottime(struct timespec *ts) { + struct timekeeper *tk = &timekeeper; struct timespec boottime = { - .tv_sec = wall_to_monotonic.tv_sec + total_sleep_time.tv_sec, - .tv_nsec = wall_to_monotonic.tv_nsec + total_sleep_time.tv_nsec + .tv_sec = tk->wall_to_monotonic.tv_sec + + tk->total_sleep_time.tv_sec, + .tv_nsec = tk->wall_to_monotonic.tv_nsec + + tk->total_sleep_time.tv_nsec }; set_normalized_timespec(ts, -boottime.tv_sec, -boottime.tv_nsec); } +EXPORT_SYMBOL_GPL(getboottime); + +/** + * get_monotonic_boottime - Returns monotonic time since boot + * @ts: pointer to the timespec to be set + * + * Returns the monotonic time since boot in a timespec. + * + * This is similar to CLOCK_MONTONIC/ktime_get_ts, but also + * includes the time spent in suspend. + */ +void get_monotonic_boottime(struct timespec *ts) +{ + struct timekeeper *tk = &timekeeper; + struct timespec tomono, sleep; + s64 nsec; + unsigned int seq; + + WARN_ON(timekeeping_suspended); + + do { + seq = read_seqcount_begin(&timekeeper_seq); + ts->tv_sec = tk->xtime_sec; + nsec = timekeeping_get_ns(tk); + tomono = tk->wall_to_monotonic; + sleep = tk->total_sleep_time; + + } while (read_seqcount_retry(&timekeeper_seq, seq)); + + ts->tv_sec += tomono.tv_sec + sleep.tv_sec; + ts->tv_nsec = 0; + timespec_add_ns(ts, nsec + tomono.tv_nsec + sleep.tv_nsec); +} +EXPORT_SYMBOL_GPL(get_monotonic_boottime); + +/** + * ktime_get_boottime - Returns monotonic time since boot in a ktime + * + * Returns the monotonic time since boot in a ktime + * + * This is similar to CLOCK_MONTONIC/ktime_get, but also + * includes the time spent in suspend. + */ +ktime_t ktime_get_boottime(void) +{ + struct timespec ts; + + get_monotonic_boottime(&ts); + return timespec_to_ktime(ts); +} +EXPORT_SYMBOL_GPL(ktime_get_boottime); /** * monotonic_to_bootbased - Convert the monotonic time to boot based. @@ -887,30 +1522,38 @@ void getboottime(struct timespec *ts) */ void monotonic_to_bootbased(struct timespec *ts) { - *ts = timespec_add_safe(*ts, total_sleep_time); + struct timekeeper *tk = &timekeeper; + + *ts = timespec_add(*ts, tk->total_sleep_time); } +EXPORT_SYMBOL_GPL(monotonic_to_bootbased); unsigned long get_seconds(void) { - return xtime_cache.tv_sec; + struct timekeeper *tk = &timekeeper; + + return tk->xtime_sec; } EXPORT_SYMBOL(get_seconds); struct timespec __current_kernel_time(void) { - return xtime_cache; + struct timekeeper *tk = &timekeeper; + + return tk_xtime(tk); } struct timespec current_kernel_time(void) { + struct timekeeper *tk = &timekeeper; struct timespec now; unsigned long seq; do { - seq = read_seqbegin(&xtime_lock); + seq = read_seqcount_begin(&timekeeper_seq); - now = xtime_cache; - } while (read_seqretry(&xtime_lock, seq)); + now = tk_xtime(tk); + } while (read_seqcount_retry(&timekeeper_seq, seq)); return now; } @@ -918,17 +1561,184 @@ EXPORT_SYMBOL(current_kernel_time); struct timespec get_monotonic_coarse(void) { + struct timekeeper *tk = &timekeeper; struct timespec now, mono; unsigned long seq; do { - seq = read_seqbegin(&xtime_lock); + seq = read_seqcount_begin(&timekeeper_seq); - now = xtime_cache; - mono = wall_to_monotonic; - } while (read_seqretry(&xtime_lock, seq)); + now = tk_xtime(tk); + mono = tk->wall_to_monotonic; + } while (read_seqcount_retry(&timekeeper_seq, seq)); set_normalized_timespec(&now, now.tv_sec + mono.tv_sec, now.tv_nsec + mono.tv_nsec); return now; } + +/* + * Must hold jiffies_lock + */ +void do_timer(unsigned long ticks) +{ + jiffies_64 += ticks; + calc_global_load(ticks); +} + +/** + * get_xtime_and_monotonic_and_sleep_offset() - get xtime, wall_to_monotonic, + * and sleep offsets. + * @xtim: pointer to timespec to be set with xtime + * @wtom: pointer to timespec to be set with wall_to_monotonic + * @sleep: pointer to timespec to be set with time in suspend + */ +void get_xtime_and_monotonic_and_sleep_offset(struct timespec *xtim, + struct timespec *wtom, struct timespec *sleep) +{ + struct timekeeper *tk = &timekeeper; + unsigned long seq; + + do { + seq = read_seqcount_begin(&timekeeper_seq); + *xtim = tk_xtime(tk); + *wtom = tk->wall_to_monotonic; + *sleep = tk->total_sleep_time; + } while (read_seqcount_retry(&timekeeper_seq, seq)); +} + +#ifdef CONFIG_HIGH_RES_TIMERS +/** + * ktime_get_update_offsets - hrtimer helper + * @offs_real: pointer to storage for monotonic -> realtime offset + * @offs_boot: pointer to storage for monotonic -> boottime offset + * @offs_tai: pointer to storage for monotonic -> clock tai offset + * + * Returns current monotonic time and updates the offsets + * Called from hrtimer_interrupt() or retrigger_next_event() + */ +ktime_t ktime_get_update_offsets(ktime_t *offs_real, ktime_t *offs_boot, + ktime_t *offs_tai) +{ + struct timekeeper *tk = &timekeeper; + ktime_t now; + unsigned int seq; + u64 secs, nsecs; + + do { + seq = read_seqcount_begin(&timekeeper_seq); + + secs = tk->xtime_sec; + nsecs = timekeeping_get_ns(tk); + + *offs_real = tk->offs_real; + *offs_boot = tk->offs_boot; + *offs_tai = tk->offs_tai; + } while (read_seqcount_retry(&timekeeper_seq, seq)); + + now = ktime_add_ns(ktime_set(secs, 0), nsecs); + now = ktime_sub(now, *offs_real); + return now; +} +#endif + +/** + * ktime_get_monotonic_offset() - get wall_to_monotonic in ktime_t format + */ +ktime_t ktime_get_monotonic_offset(void) +{ + struct timekeeper *tk = &timekeeper; + unsigned long seq; + struct timespec wtom; + + do { + seq = read_seqcount_begin(&timekeeper_seq); + wtom = tk->wall_to_monotonic; + } while (read_seqcount_retry(&timekeeper_seq, seq)); + + return timespec_to_ktime(wtom); +} +EXPORT_SYMBOL_GPL(ktime_get_monotonic_offset); + +/** + * do_adjtimex() - Accessor function to NTP __do_adjtimex function + */ +int do_adjtimex(struct timex *txc) +{ + struct timekeeper *tk = &timekeeper; + unsigned long flags; + struct timespec ts; + s32 orig_tai, tai; + int ret; + + /* Validate the data before disabling interrupts */ + ret = ntp_validate_timex(txc); + if (ret) + return ret; + + if (txc->modes & ADJ_SETOFFSET) { + struct timespec delta; + delta.tv_sec = txc->time.tv_sec; + delta.tv_nsec = txc->time.tv_usec; + if (!(txc->modes & ADJ_NANO)) + delta.tv_nsec *= 1000; + ret = timekeeping_inject_offset(&delta); + if (ret) + return ret; + } + + getnstimeofday(&ts); + + raw_spin_lock_irqsave(&timekeeper_lock, flags); + write_seqcount_begin(&timekeeper_seq); + + orig_tai = tai = tk->tai_offset; + ret = __do_adjtimex(txc, &ts, &tai); + + if (tai != orig_tai) { + __timekeeping_set_tai_offset(tk, tai); + timekeeping_update(tk, TK_MIRROR | TK_CLOCK_WAS_SET); + } + write_seqcount_end(&timekeeper_seq); + raw_spin_unlock_irqrestore(&timekeeper_lock, flags); + + if (tai != orig_tai) + clock_was_set(); + + ntp_notify_cmos_timer(); + + return ret; +} + +#ifdef CONFIG_NTP_PPS +/** + * hardpps() - Accessor function to NTP __hardpps function + */ +void hardpps(const struct timespec *phase_ts, const struct timespec *raw_ts) +{ + unsigned long flags; + + raw_spin_lock_irqsave(&timekeeper_lock, flags); + write_seqcount_begin(&timekeeper_seq); + + __hardpps(phase_ts, raw_ts); + + write_seqcount_end(&timekeeper_seq); + raw_spin_unlock_irqrestore(&timekeeper_lock, flags); +} +EXPORT_SYMBOL(hardpps); +#endif + +/** + * xtime_update() - advances the timekeeping infrastructure + * @ticks: number of ticks, that have elapsed since the last call. + * + * Must be called with interrupts disabled. + */ +void xtime_update(unsigned long ticks) +{ + write_seqlock(&jiffies_lock); + do_timer(ticks); + write_sequnlock(&jiffies_lock); + update_wall_time(); +} diff --git a/kernel/time/timekeeping_debug.c b/kernel/time/timekeeping_debug.c new file mode 100644 index 00000000000..4d54f97558d --- /dev/null +++ b/kernel/time/timekeeping_debug.c @@ -0,0 +1,74 @@ +/* + * debugfs file to track time spent in suspend + * + * Copyright (c) 2011, Google, Inc. + * + * This program is free software; you can redistribute it and/or modify + * it under the terms of the GNU General Public License as published by + * the Free Software Foundation; either version 2 of the License, or + * (at your option) any later version. + * + * This program is distributed in the hope that it will be useful, but WITHOUT + * ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or + * FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License for + * more details. + */ + +#include <linux/debugfs.h> +#include <linux/err.h> +#include <linux/init.h> +#include <linux/kernel.h> +#include <linux/seq_file.h> +#include <linux/time.h> + +#include "timekeeping_internal.h" + +static unsigned int sleep_time_bin[32] = {0}; + +static int tk_debug_show_sleep_time(struct seq_file *s, void *data) +{ + unsigned int bin; + seq_puts(s, " time (secs) count\n"); + seq_puts(s, "------------------------------\n"); + for (bin = 0; bin < 32; bin++) { + if (sleep_time_bin[bin] == 0) + continue; + seq_printf(s, "%10u - %-10u %4u\n", + bin ? 1 << (bin - 1) : 0, 1 << bin, + sleep_time_bin[bin]); + } + return 0; +} + +static int tk_debug_sleep_time_open(struct inode *inode, struct file *file) +{ + return single_open(file, tk_debug_show_sleep_time, NULL); +} + +static const struct file_operations tk_debug_sleep_time_fops = { + .open = tk_debug_sleep_time_open, + .read = seq_read, + .llseek = seq_lseek, + .release = single_release, +}; + +static int __init tk_debug_sleep_time_init(void) +{ + struct dentry *d; + + d = debugfs_create_file("sleep_time", 0444, NULL, NULL, + &tk_debug_sleep_time_fops); + if (!d) { + pr_err("Failed to create sleep_time debug file\n"); + return -ENOMEM; + } + + return 0; +} +late_initcall(tk_debug_sleep_time_init); + +void tk_debug_account_sleep_time(struct timespec *t) +{ + sleep_time_bin[fls(t->tv_sec)]++; +} + diff --git a/kernel/time/timekeeping_internal.h b/kernel/time/timekeeping_internal.h new file mode 100644 index 00000000000..13323ea08ff --- /dev/null +++ b/kernel/time/timekeeping_internal.h @@ -0,0 +1,14 @@ +#ifndef _TIMEKEEPING_INTERNAL_H +#define _TIMEKEEPING_INTERNAL_H +/* + * timekeeping debug functions + */ +#include <linux/time.h> + +#ifdef CONFIG_DEBUG_FS +extern void tk_debug_account_sleep_time(struct timespec *t); +#else +#define tk_debug_account_sleep_time(x) +#endif + +#endif /* _TIMEKEEPING_INTERNAL_H */ diff --git a/kernel/time/timer_list.c b/kernel/time/timer_list.c index bdfb8dd1050..61ed862cdd3 100644 --- a/kernel/time/timer_list.c +++ b/kernel/time/timer_list.c @@ -20,6 +20,13 @@ #include <asm/uaccess.h> + +struct timer_list_iter { + int cpu; + bool second_pass; + u64 now; +}; + typedef void (*print_fn_t)(struct seq_file *m, unsigned int *classes); DECLARE_PER_CPU(struct hrtimer_cpu_base, hrtimer_bases); @@ -41,7 +48,7 @@ static void print_name_offset(struct seq_file *m, void *sym) char symname[KSYM_NAME_LEN]; if (lookup_symbol_name((unsigned long)sym, symname) < 0) - SEQ_printf(m, "<%p>", sym); + SEQ_printf(m, "<%pK>", sym); else SEQ_printf(m, "%s", symname); } @@ -79,26 +86,26 @@ print_active_timers(struct seq_file *m, struct hrtimer_clock_base *base, { struct hrtimer *timer, tmp; unsigned long next = 0, i; - struct rb_node *curr; + struct timerqueue_node *curr; unsigned long flags; next_one: i = 0; raw_spin_lock_irqsave(&base->cpu_base->lock, flags); - curr = base->first; + curr = timerqueue_getnext(&base->active); /* * Crude but we have to do this O(N*N) thing, because * we have to unlock the base when printing: */ while (curr && i < next) { - curr = rb_next(curr); + curr = timerqueue_iterate_next(curr); i++; } if (curr) { - timer = rb_entry(curr, struct hrtimer, node); + timer = container_of(curr, struct hrtimer, node); tmp = *timer; raw_spin_unlock_irqrestore(&base->cpu_base->lock, flags); @@ -112,7 +119,7 @@ next_one: static void print_base(struct seq_file *m, struct hrtimer_clock_base *base, u64 now) { - SEQ_printf(m, " .base: %p\n", base); + SEQ_printf(m, " .base: %pK\n", base); SEQ_printf(m, " .index: %d\n", base->index); SEQ_printf(m, " .resolution: %Lu nsecs\n", @@ -133,7 +140,6 @@ static void print_cpu(struct seq_file *m, int cpu, u64 now) struct hrtimer_cpu_base *cpu_base = &per_cpu(hrtimer_bases, cpu); int i; - SEQ_printf(m, "\n"); SEQ_printf(m, "cpu: %d\n", cpu); for (i = 0; i < HRTIMER_MAX_CLOCK_BASES; i++) { SEQ_printf(m, " clock %d:\n", i); @@ -167,7 +173,7 @@ static void print_cpu(struct seq_file *m, int cpu, u64 now) { struct tick_sched *ts = tick_get_tick_sched(cpu); P(nohz_mode); - P_ns(idle_tick); + P_ns(last_tick); P(tick_stopped); P(idle_jiffies); P(idle_calls); @@ -176,6 +182,7 @@ static void print_cpu(struct seq_file *m, int cpu, u64 now) P_ns(idle_waketime); P_ns(idle_exittime); P_ns(idle_sleeptime); + P_ns(iowait_sleeptime); P(last_jiffies); P(next_jiffies); P_ns(idle_expires); @@ -186,6 +193,7 @@ static void print_cpu(struct seq_file *m, int cpu, u64 now) #undef P #undef P_ns + SEQ_printf(m, "\n"); } #ifdef CONFIG_GENERIC_CLOCKEVENTS @@ -194,7 +202,6 @@ print_tickdevice(struct seq_file *m, struct tick_device *td, int cpu) { struct clock_event_device *dev = td->evtdev; - SEQ_printf(m, "\n"); SEQ_printf(m, "Tick Device: mode: %d\n", td->mode); if (cpu < 0) SEQ_printf(m, "Broadcast device\n"); @@ -228,12 +235,12 @@ print_tickdevice(struct seq_file *m, struct tick_device *td, int cpu) SEQ_printf(m, " event_handler: "); print_name_offset(m, dev->event_handler); SEQ_printf(m, "\n"); + SEQ_printf(m, " retries: %lu\n", dev->retries); + SEQ_printf(m, "\n"); } -static void timer_list_show_tickdevices(struct seq_file *m) +static void timer_list_show_tickdevices_header(struct seq_file *m) { - int cpu; - #ifdef CONFIG_GENERIC_CLOCKEVENTS_BROADCAST print_tickdevice(m, tick_get_broadcast_device(), -1); SEQ_printf(m, "tick_broadcast_mask: %08lx\n", @@ -244,47 +251,111 @@ static void timer_list_show_tickdevices(struct seq_file *m) #endif SEQ_printf(m, "\n"); #endif - for_each_online_cpu(cpu) - print_tickdevice(m, tick_get_device(cpu), cpu); - SEQ_printf(m, "\n"); } -#else -static void timer_list_show_tickdevices(struct seq_file *m) { } #endif +static inline void timer_list_header(struct seq_file *m, u64 now) +{ + SEQ_printf(m, "Timer List Version: v0.7\n"); + SEQ_printf(m, "HRTIMER_MAX_CLOCK_BASES: %d\n", HRTIMER_MAX_CLOCK_BASES); + SEQ_printf(m, "now at %Ld nsecs\n", (unsigned long long)now); + SEQ_printf(m, "\n"); +} + static int timer_list_show(struct seq_file *m, void *v) { + struct timer_list_iter *iter = v; + + if (iter->cpu == -1 && !iter->second_pass) + timer_list_header(m, iter->now); + else if (!iter->second_pass) + print_cpu(m, iter->cpu, iter->now); +#ifdef CONFIG_GENERIC_CLOCKEVENTS + else if (iter->cpu == -1 && iter->second_pass) + timer_list_show_tickdevices_header(m); + else + print_tickdevice(m, tick_get_device(iter->cpu), iter->cpu); +#endif + return 0; +} + +void sysrq_timer_list_show(void) +{ u64 now = ktime_to_ns(ktime_get()); int cpu; - SEQ_printf(m, "Timer List Version: v0.5\n"); - SEQ_printf(m, "HRTIMER_MAX_CLOCK_BASES: %d\n", HRTIMER_MAX_CLOCK_BASES); - SEQ_printf(m, "now at %Ld nsecs\n", (unsigned long long)now); + timer_list_header(NULL, now); for_each_online_cpu(cpu) - print_cpu(m, cpu, now); + print_cpu(NULL, cpu, now); - SEQ_printf(m, "\n"); - timer_list_show_tickdevices(m); +#ifdef CONFIG_GENERIC_CLOCKEVENTS + timer_list_show_tickdevices_header(NULL); + for_each_online_cpu(cpu) + print_tickdevice(NULL, tick_get_device(cpu), cpu); +#endif + return; +} - return 0; +static void *move_iter(struct timer_list_iter *iter, loff_t offset) +{ + for (; offset; offset--) { + iter->cpu = cpumask_next(iter->cpu, cpu_online_mask); + if (iter->cpu >= nr_cpu_ids) { +#ifdef CONFIG_GENERIC_CLOCKEVENTS + if (!iter->second_pass) { + iter->cpu = -1; + iter->second_pass = true; + } else + return NULL; +#else + return NULL; +#endif + } + } + return iter; } -void sysrq_timer_list_show(void) +static void *timer_list_start(struct seq_file *file, loff_t *offset) +{ + struct timer_list_iter *iter = file->private; + + if (!*offset) + iter->now = ktime_to_ns(ktime_get()); + iter->cpu = -1; + iter->second_pass = false; + return move_iter(iter, *offset); +} + +static void *timer_list_next(struct seq_file *file, void *v, loff_t *offset) +{ + struct timer_list_iter *iter = file->private; + ++*offset; + return move_iter(iter, 1); +} + +static void timer_list_stop(struct seq_file *seq, void *v) { - timer_list_show(NULL, NULL); } +static const struct seq_operations timer_list_sops = { + .start = timer_list_start, + .next = timer_list_next, + .stop = timer_list_stop, + .show = timer_list_show, +}; + static int timer_list_open(struct inode *inode, struct file *filp) { - return single_open(filp, timer_list_show, NULL); + return seq_open_private(filp, &timer_list_sops, + sizeof(struct timer_list_iter)); } static const struct file_operations timer_list_fops = { .open = timer_list_open, .read = seq_read, .llseek = seq_lseek, - .release = single_release, + .release = seq_release_private, }; static int __init init_timer_list_procfs(void) diff --git a/kernel/time/timer_stats.c b/kernel/time/timer_stats.c index 2f3b585b8d7..1fb08f21302 100644 --- a/kernel/time/timer_stats.c +++ b/kernel/time/timer_stats.c @@ -81,7 +81,7 @@ struct entry { /* * Spinlock protecting the tables - not taken during lookup: */ -static DEFINE_SPINLOCK(table_lock); +static DEFINE_RAW_SPINLOCK(table_lock); /* * Per-CPU lookup locks for fast hash lookup: @@ -188,7 +188,7 @@ static struct entry *tstat_lookup(struct entry *entry, char *comm) prev = NULL; curr = *head; - spin_lock(&table_lock); + raw_spin_lock(&table_lock); /* * Make sure we have not raced with another CPU: */ @@ -215,7 +215,7 @@ static struct entry *tstat_lookup(struct entry *entry, char *comm) *head = curr; } out_unlock: - spin_unlock(&table_lock); + raw_spin_unlock(&table_lock); return curr; } @@ -236,7 +236,7 @@ void timer_stats_update_stats(void *timer, pid_t pid, void *startf, unsigned int timer_flag) { /* - * It doesnt matter which lock we take: + * It doesn't matter which lock we take: */ raw_spinlock_t *lock; struct entry *entry, input; @@ -298,15 +298,15 @@ static int tstats_show(struct seq_file *m, void *v) period = ktime_to_timespec(time); ms = period.tv_nsec / 1000000; - seq_puts(m, "Timer Stats Version: v0.2\n"); + seq_puts(m, "Timer Stats Version: v0.3\n"); seq_printf(m, "Sample period: %ld.%03ld s\n", period.tv_sec, ms); if (atomic_read(&overflow_count)) - seq_printf(m, "Overflow: %d entries\n", - atomic_read(&overflow_count)); + seq_printf(m, "Overflow: %d entries\n", atomic_read(&overflow_count)); + seq_printf(m, "Collection: %s\n", timer_stats_active ? "active" : "inactive"); for (i = 0; i < nr_entries; i++) { entry = entries + i; - if (entry->timer_flag & TIMER_STATS_FLAG_DEFERRABLE) { + if (entry->timer_flag & TIMER_STATS_FLAG_DEFERRABLE) { seq_printf(m, "%4luD, %5d %-16s ", entry->count, entry->pid, entry->comm); } else { |
