Diffstat (limited to 'kernel/time')

 kernel/time/Kconfig                  |   4
 kernel/time/Makefile                 |   5
 kernel/time/alarmtimer.c             |  24
 kernel/time/clockevents.c            | 117
 kernel/time/clocksource.c            |  52
 kernel/time/jiffies.c                |   6
 kernel/time/ntp.c                    |  46
 kernel/time/sched_clock.c            | 145
 kernel/time/tick-broadcast-hrtimer.c | 106
 kernel/time/tick-broadcast.c         |  93
 kernel/time/tick-common.c            |  34
 kernel/time/tick-internal.h          |  18
 kernel/time/tick-sched.c             |  87
 kernel/time/timekeeping.c            |  73
 kernel/time/timekeeping_debug.c      |   2
 kernel/time/timer_stats.c            |   8
 16 files changed, 563 insertions(+), 257 deletions(-)
diff --git a/kernel/time/Kconfig b/kernel/time/Kconfig
index 2b62fe86f9e..f448513a45e 100644
--- a/kernel/time/Kconfig
+++ b/kernel/time/Kconfig
@@ -100,7 +100,7 @@ config NO_HZ_FULL
 	# RCU_USER_QS dependency
 	depends on HAVE_CONTEXT_TRACKING
 	# VIRT_CPU_ACCOUNTING_GEN dependency
-	depends on 64BIT
+	depends on HAVE_VIRT_CPU_ACCOUNTING_GEN
 	select NO_HZ_COMMON
 	select RCU_USER_QS
 	select RCU_NOCB_CPU
@@ -124,7 +124,7 @@ config NO_HZ_FULL
 endchoice

 config NO_HZ_FULL_ALL
-	bool "Full dynticks system on all CPUs by default"
+	bool "Full dynticks system on all CPUs by default (except CPU 0)"
 	depends on NO_HZ_FULL
 	help
 	  If the user doesn't pass the nohz_full boot option to
diff --git a/kernel/time/Makefile b/kernel/time/Makefile
index 9250130646f..57a413fd0eb 100644
--- a/kernel/time/Makefile
+++ b/kernel/time/Makefile
@@ -3,7 +3,10 @@ obj-y += timeconv.o posix-clock.o alarmtimer.o

 obj-$(CONFIG_GENERIC_CLOCKEVENTS_BUILD)		+= clockevents.o
 obj-$(CONFIG_GENERIC_CLOCKEVENTS)		+= tick-common.o
-obj-$(CONFIG_GENERIC_CLOCKEVENTS_BROADCAST)	+= tick-broadcast.o
+ifeq ($(CONFIG_GENERIC_CLOCKEVENTS_BROADCAST),y)
+ obj-y						+= tick-broadcast.o
+ obj-$(CONFIG_TICK_ONESHOT)			+= tick-broadcast-hrtimer.o
+endif
 obj-$(CONFIG_GENERIC_SCHED_CLOCK)		+= sched_clock.o
 obj-$(CONFIG_TICK_ONESHOT)			+= tick-oneshot.o
 obj-$(CONFIG_TICK_ONESHOT)			+= tick-sched.o
diff --git a/kernel/time/alarmtimer.c b/kernel/time/alarmtimer.c
index eec50fcef9e..fe75444ae7e 100644
--- a/kernel/time/alarmtimer.c
+++ b/kernel/time/alarmtimer.c
@@ -490,7 +490,7 @@ static int alarm_clock_getres(const clockid_t which_clock, struct timespec *tp)
 	clockid_t baseid = alarm_bases[clock2alarm(which_clock)].base_clockid;

 	if (!alarmtimer_get_rtcdev())
-		return -ENOTSUPP;
+		return -EINVAL;

 	return hrtimer_get_res(baseid, tp);
 }
@@ -507,7 +507,7 @@ static int alarm_clock_get(clockid_t which_clock, struct timespec *tp)
 	struct alarm_base *base = &alarm_bases[clock2alarm(which_clock)];

 	if (!alarmtimer_get_rtcdev())
-		return -ENOTSUPP;
+		return -EINVAL;

 	*tp = ktime_to_timespec(base->gettime());
 	return 0;
@@ -585,9 +585,14 @@ static int alarm_timer_set(struct k_itimer *timr, int flags,
 				struct itimerspec *new_setting,
 				struct itimerspec *old_setting)
 {
+	ktime_t exp;
+
 	if (!rtcdev)
 		return -ENOTSUPP;

+	if (flags & ~TIMER_ABSTIME)
+		return -EINVAL;
+
 	if (old_setting)
 		alarm_timer_get(timr, old_setting);

@@ -597,8 +602,16 @@ static int alarm_timer_set(struct k_itimer *timr, int flags,

 	/* start the timer */
 	timr->it.alarm.interval = timespec_to_ktime(new_setting->it_interval);
-	alarm_start(&timr->it.alarm.alarmtimer,
-			timespec_to_ktime(new_setting->it_value));
+	exp = timespec_to_ktime(new_setting->it_value);
+	/* Convert (if necessary) to absolute time */
+	if (flags != TIMER_ABSTIME) {
+		ktime_t now;
+
+		now = alarm_bases[timr->it.alarm.alarmtimer.type].gettime();
+		exp = ktime_add(now, exp);
+	}
+
+	alarm_start(&timr->it.alarm.alarmtimer, exp);
 	return 0;
 }
@@ -730,6 +743,9 @@ static int alarm_timer_nsleep(const clockid_t which_clock, int flags,
 	if (!alarmtimer_get_rtcdev())
 		return -ENOTSUPP;

+	if (flags & ~TIMER_ABSTIME)
+		return -EINVAL;
+
 	if (!capable(CAP_WAKE_ALARM))
 		return -EPERM;
diff --git a/kernel/time/clockevents.c b/kernel/time/clockevents.c
index 38959c86678..9c94c19f130 100644
--- a/kernel/time/clockevents.c
+++ b/kernel/time/clockevents.c
@@ -33,29 +33,64 @@ struct ce_unbind {
 	int res;
 };

-/**
- * clockevents_delta2ns - Convert a latch value (device ticks) to nanoseconds
- * @latch:	value to convert
- * @evt:	pointer to clock event device descriptor
- *
- * Math helper, returns latch value converted to nanoseconds (bound checked)
- */
-u64 clockevent_delta2ns(unsigned long latch, struct clock_event_device *evt)
+static u64 cev_delta2ns(unsigned long latch, struct clock_event_device *evt,
+			bool ismax)
 {
 	u64 clc = (u64) latch << evt->shift;
+	u64 rnd;

 	if (unlikely(!evt->mult)) {
 		evt->mult = 1;
 		WARN_ON(1);
 	}
+	rnd = (u64) evt->mult - 1;
+
+	/*
+	 * Upper bound sanity check. If the backwards conversion is
+	 * not equal latch, we know that the above shift overflowed.
+	 */
+	if ((clc >> evt->shift) != (u64)latch)
+		clc = ~0ULL;
+
+	/*
+	 * Scaled math oddities:
+	 *
+	 * For mult <= (1 << shift) we can safely add mult - 1 to
+	 * prevent integer rounding loss. So the backwards conversion
+	 * from nsec to device ticks will be correct.
+	 *
+	 * For mult > (1 << shift), i.e. device frequency is > 1GHz we
+	 * need to be careful. Adding mult - 1 will result in a value
+	 * which when converted back to device ticks can be larger
+	 * than latch by up to (mult - 1) >> shift. For the min_delta
+	 * calculation we still want to apply this in order to stay
+	 * above the minimum device ticks limit. For the upper limit
+	 * we would end up with a latch value larger than the upper
+	 * limit of the device, so we omit the add to stay below the
+	 * device upper boundary.
+	 *
+	 * Also omit the add if it would overflow the u64 boundary.
+	 */
+	if ((~0ULL - clc > rnd) &&
+	    (!ismax || evt->mult <= (1U << evt->shift)))
+		clc += rnd;

 	do_div(clc, evt->mult);
-	if (clc < 1000)
-		clc = 1000;
-	if (clc > KTIME_MAX)
-		clc = KTIME_MAX;

-	return clc;
+	/* Deltas less than 1usec are pointless noise */
+	return clc > 1000 ? clc : 1000;
+}
+
+/**
+ * clockevents_delta2ns - Convert a latch value (device ticks) to nanoseconds
+ * @latch:	value to convert
+ * @evt:	pointer to clock event device descriptor
+ *
+ * Math helper, returns latch value converted to nanoseconds (bound checked)
+ */
+u64 clockevent_delta2ns(unsigned long latch, struct clock_event_device *evt)
+{
+	return cev_delta2ns(latch, evt, false);
 }
 EXPORT_SYMBOL_GPL(clockevent_delta2ns);
dev->name : "?", + (unsigned long long) dev->min_delta_ns); return 0; } @@ -380,8 +417,8 @@ void clockevents_config(struct clock_event_device *dev, u32 freq) sec = 600; clockevents_calc_mult_shift(dev, freq, sec); - dev->min_delta_ns = clockevent_delta2ns(dev->min_delta_ticks, dev); - dev->max_delta_ns = clockevent_delta2ns(dev->max_delta_ticks, dev); + dev->min_delta_ns = cev_delta2ns(dev->min_delta_ticks, dev, false); + dev->max_delta_ns = cev_delta2ns(dev->max_delta_ticks, dev, true); } /** @@ -404,6 +441,19 @@ void clockevents_config_and_register(struct clock_event_device *dev, } EXPORT_SYMBOL_GPL(clockevents_config_and_register); +int __clockevents_update_freq(struct clock_event_device *dev, u32 freq) +{ + clockevents_config(dev, freq); + + if (dev->mode == CLOCK_EVT_MODE_ONESHOT) + return clockevents_program_event(dev, dev->next_event, false); + + if (dev->mode == CLOCK_EVT_MODE_PERIODIC) + dev->set_mode(CLOCK_EVT_MODE_PERIODIC, dev); + + return 0; +} + /** * clockevents_update_freq - Update frequency and reprogram a clock event device. * @dev: device to modify @@ -411,17 +461,22 @@ EXPORT_SYMBOL_GPL(clockevents_config_and_register); * * Reconfigure and reprogram a clock event device in oneshot * mode. Must be called on the cpu for which the device delivers per - * cpu timer events with interrupts disabled! Returns 0 on success, - * -ETIME when the event is in the past. + * cpu timer events. If called for the broadcast device the core takes + * care of serialization. + * + * Returns 0 on success, -ETIME when the event is in the past. */ int clockevents_update_freq(struct clock_event_device *dev, u32 freq) { - clockevents_config(dev, freq); - - if (dev->mode != CLOCK_EVT_MODE_ONESHOT) - return 0; + unsigned long flags; + int ret; - return clockevents_program_event(dev, dev->next_event, false); + local_irq_save(flags); + ret = tick_broadcast_update_freq(dev, freq); + if (ret == -ENODEV) + ret = __clockevents_update_freq(dev, freq); + local_irq_restore(flags); + return ret; } /* @@ -489,12 +544,13 @@ void clockevents_resume(void) #ifdef CONFIG_GENERIC_CLOCKEVENTS /** * clockevents_notify - notification about relevant events + * Returns 0 on success, any other value on error */ -void clockevents_notify(unsigned long reason, void *arg) +int clockevents_notify(unsigned long reason, void *arg) { struct clock_event_device *dev, *tmp; unsigned long flags; - int cpu; + int cpu, ret = 0; raw_spin_lock_irqsave(&clockevents_lock, flags); @@ -507,7 +563,7 @@ void clockevents_notify(unsigned long reason, void *arg) case CLOCK_EVT_NOTIFY_BROADCAST_ENTER: case CLOCK_EVT_NOTIFY_BROADCAST_EXIT: - tick_broadcast_oneshot_control(reason); + ret = tick_broadcast_oneshot_control(reason); break; case CLOCK_EVT_NOTIFY_CPU_DYING: @@ -550,6 +606,7 @@ void clockevents_notify(unsigned long reason, void *arg) break; } raw_spin_unlock_irqrestore(&clockevents_lock, flags); + return ret; } EXPORT_SYMBOL_GPL(clockevents_notify); @@ -584,7 +641,7 @@ static ssize_t sysfs_unbind_tick_dev(struct device *dev, const char *buf, size_t count) { char name[CS_NAME_LEN]; - size_t ret = sysfs_get_uname(buf, name, count); + ssize_t ret = sysfs_get_uname(buf, name, count); struct clock_event_device *ce; if (ret < 0) diff --git a/kernel/time/clocksource.c b/kernel/time/clocksource.c index 50a8736757f..ba3e502c955 100644 --- a/kernel/time/clocksource.c +++ b/kernel/time/clocksource.c @@ -479,6 +479,7 @@ static inline void clocksource_dequeue_watchdog(struct clocksource *cs) { } static inline void clocksource_resume_watchdog(void) { } 
diff --git a/kernel/time/clocksource.c b/kernel/time/clocksource.c
index 50a8736757f..ba3e502c955 100644
--- a/kernel/time/clocksource.c
+++ b/kernel/time/clocksource.c
@@ -479,6 +479,7 @@ static inline void clocksource_dequeue_watchdog(struct clocksource *cs) { }
 static inline void clocksource_resume_watchdog(void) { }
 static inline int __clocksource_watchdog_kthread(void) { return 0; }
 static bool clocksource_is_watchdog(struct clocksource *cs) { return false; }
+void clocksource_mark_unstable(struct clocksource *cs) { }

 #endif /* CONFIG_CLOCKSOURCE_WATCHDOG */

@@ -537,40 +538,55 @@ static u32 clocksource_max_adjustment(struct clocksource *cs)
 }

 /**
- * clocksource_max_deferment - Returns max time the clocksource can be deferred
- * @cs:         Pointer to clocksource
- *
+ * clocks_calc_max_nsecs - Returns maximum nanoseconds that can be converted
+ * @mult:	cycle to nanosecond multiplier
+ * @shift:	cycle to nanosecond divisor (power of two)
+ * @maxadj:	maximum adjustment value to mult (~11%)
+ * @mask:	bitmask for two's complement subtraction of non 64 bit counters
  */
-static u64 clocksource_max_deferment(struct clocksource *cs)
+u64 clocks_calc_max_nsecs(u32 mult, u32 shift, u32 maxadj, u64 mask)
 {
 	u64 max_nsecs, max_cycles;

 	/*
 	 * Calculate the maximum number of cycles that we can pass to the
 	 * cyc2ns function without overflowing a 64-bit signed result. The
-	 * maximum number of cycles is equal to ULLONG_MAX/(cs->mult+cs->maxadj)
+	 * maximum number of cycles is equal to ULLONG_MAX/(mult+maxadj)
 	 * which is equivalent to the below.
-	 * max_cycles < (2^63)/(cs->mult + cs->maxadj)
-	 * max_cycles < 2^(log2((2^63)/(cs->mult + cs->maxadj)))
-	 * max_cycles < 2^(log2(2^63) - log2(cs->mult + cs->maxadj))
-	 * max_cycles < 2^(63 - log2(cs->mult + cs->maxadj))
-	 * max_cycles < 1 << (63 - log2(cs->mult + cs->maxadj))
+	 * max_cycles < (2^63)/(mult + maxadj)
+	 * max_cycles < 2^(log2((2^63)/(mult + maxadj)))
+	 * max_cycles < 2^(log2(2^63) - log2(mult + maxadj))
+	 * max_cycles < 2^(63 - log2(mult + maxadj))
+	 * max_cycles < 1 << (63 - log2(mult + maxadj))
 	 * Please note that we add 1 to the result of the log2 to account for
 	 * any rounding errors, ensure the above inequality is satisfied and
 	 * no overflow will occur.
 	 */
-	max_cycles = 1ULL << (63 - (ilog2(cs->mult + cs->maxadj) + 1));
+	max_cycles = 1ULL << (63 - (ilog2(mult + maxadj) + 1));

 	/*
 	 * The actual maximum number of cycles we can defer the clocksource is
-	 * determined by the minimum of max_cycles and cs->mask.
+	 * determined by the minimum of max_cycles and mask.
 	 * Note: Here we subtract the maxadj to make sure we don't sleep for
 	 * too long if there's a large negative adjustment.
 	 */
-	max_cycles = min_t(u64, max_cycles, (u64) cs->mask);
-	max_nsecs = clocksource_cyc2ns(max_cycles, cs->mult - cs->maxadj,
-					cs->shift);
+	max_cycles = min(max_cycles, mask);
+	max_nsecs = clocksource_cyc2ns(max_cycles, mult - maxadj, shift);
+
+	return max_nsecs;
+}
+
+/**
+ * clocksource_max_deferment - Returns max time the clocksource can be deferred
+ * @cs:         Pointer to clocksource
+ *
+ */
+static u64 clocksource_max_deferment(struct clocksource *cs)
+{
+	u64 max_nsecs;
+
+	max_nsecs = clocks_calc_max_nsecs(cs->mult, cs->shift, cs->maxadj,
+					  cs->mask);
 	/*
 	 * To ensure that the clocksource does not wrap whilst we are idle,
 	 * limit the time the clocksource can be deferred by 12.5%. Please
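
As an aside (illustrative only), the overflow bound in the refactored clocks_calc_max_nsecs() can be reproduced in userspace; the 1 GHz clocksource parameters below are hypothetical:

    #include <stdint.h>
    #include <stdio.h>

    static int ilog2_u32(uint32_t v)        /* floor(log2(v)), v != 0 */
    {
            int l = -1;

            while (v) {
                    v >>= 1;
                    l++;
            }
            return l;
    }

    /* Sketch of clocks_calc_max_nsecs(): clamp the cycle count so that
     * cycles * (mult + maxadj) cannot overflow 63 bits, then convert
     * with the worst-case (smallest) multiplier mult - maxadj. */
    static uint64_t max_nsecs(uint32_t mult, uint32_t shift, uint32_t maxadj,
                              uint64_t mask)
    {
            uint64_t max_cycles = 1ULL << (63 - (ilog2_u32(mult + maxadj) + 1));

            if (max_cycles > mask)
                    max_cycles = mask;
            return (max_cycles * (mult - maxadj)) >> shift;
    }

    int main(void)
    {
            /* hypothetical 1 GHz source: mult = 2^24, shift = 24, ~11% maxadj */
            uint64_t ns = max_nsecs(1 << 24, 24, (1 << 24) / 9, ~0ULL);

            printf("max deferment: ~%llu s\n",
                   (unsigned long long)(ns / 1000000000ULL));
            return 0;
    }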
@@ -893,7 +909,7 @@ sysfs_show_current_clocksources(struct device *dev,
 	return count;
 }

-size_t sysfs_get_uname(const char *buf, char *dst, size_t cnt)
+ssize_t sysfs_get_uname(const char *buf, char *dst, size_t cnt)
 {
 	size_t ret = cnt;

@@ -924,7 +940,7 @@ static ssize_t sysfs_override_clocksource(struct device *dev,
 					  struct device_attribute *attr,
 					  const char *buf, size_t count)
 {
-	size_t ret;
+	ssize_t ret;

 	mutex_lock(&clocksource_mutex);

@@ -952,7 +968,7 @@ static ssize_t sysfs_unbind_clocksource(struct device *dev,
 {
 	struct clocksource *cs;
 	char name[CS_NAME_LEN];
-	size_t ret;
+	ssize_t ret;

 	ret = sysfs_get_uname(buf, name, count);
 	if (ret < 0)
diff --git a/kernel/time/jiffies.c b/kernel/time/jiffies.c
index 7a925ba456f..a6a5bf53e86 100644
--- a/kernel/time/jiffies.c
+++ b/kernel/time/jiffies.c
@@ -51,7 +51,13 @@
  * HZ shrinks, so values greater than 8 overflow 32bits when
  * HZ=100.
  */
+#if HZ < 34
+#define JIFFIES_SHIFT	6
+#elif HZ < 67
+#define JIFFIES_SHIFT	7
+#else
 #define JIFFIES_SHIFT	8
+#endif

 static cycle_t jiffies_read(struct clocksource *cs)
 {
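
A quick arithmetic check of the new HZ-dependent JIFFIES_SHIFT (illustrative; it only assumes the jiffies clocksource mult is NSEC_PER_JIFFY << JIFFIES_SHIFT and must fit in a u32):

    #include <stdint.h>
    #include <stdio.h>

    int main(void)
    {
            /* Lower HZ means more ns per jiffy, so the shift must shrink
             * for the scaled multiplier to stay within 32 bits. */
            unsigned int hz[] = { 24, 48, 100, 250, 1000 };

            for (unsigned int i = 0; i < sizeof(hz) / sizeof(hz[0]); i++) {
                    unsigned int shift = hz[i] < 34 ? 6 : hz[i] < 67 ? 7 : 8;
                    uint64_t mult = (1000000000ULL / hz[i]) << shift;

                    printf("HZ=%-4u shift=%u mult=%llu %s\n", hz[i], shift,
                           (unsigned long long)mult,
                           mult <= 0xffffffffULL ? "(fits u32)" : "(overflow!)");
            }
            return 0;
    }

With the old fixed shift of 8, HZ=24 would give mult ~10.7e9, well past the u32 limit; the new 34/67 thresholds keep every case in range.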
diff --git a/kernel/time/ntp.c b/kernel/time/ntp.c
index 8f5b3b98577..33db43a3951 100644
--- a/kernel/time/ntp.c
+++ b/kernel/time/ntp.c
@@ -165,21 +165,21 @@ static inline void pps_set_freq(s64 freq)

 static inline int is_error_status(int status)
 {
-	return (time_status & (STA_UNSYNC|STA_CLOCKERR))
+	return (status & (STA_UNSYNC|STA_CLOCKERR))
 		/* PPS signal lost when either PPS time or
 		 * PPS frequency synchronization requested
 		 */
-		|| ((time_status & (STA_PPSFREQ|STA_PPSTIME))
-			&& !(time_status & STA_PPSSIGNAL))
+		|| ((status & (STA_PPSFREQ|STA_PPSTIME))
+			&& !(status & STA_PPSSIGNAL))
 		/* PPS jitter exceeded when
 		 * PPS time synchronization requested */
-		|| ((time_status & (STA_PPSTIME|STA_PPSJITTER))
+		|| ((status & (STA_PPSTIME|STA_PPSJITTER))
 			== (STA_PPSTIME|STA_PPSJITTER))
 		/* PPS wander exceeded or calibration error when
 		 * PPS frequency synchronization requested
 		 */
-		|| ((time_status & STA_PPSFREQ)
-			&& (time_status & (STA_PPSWANDER|STA_PPSERROR)));
+		|| ((status & STA_PPSFREQ)
+			&& (status & (STA_PPSWANDER|STA_PPSERROR)));
 }

 static inline void pps_fill_timex(struct timex *txc)
@@ -475,6 +475,7 @@ static void sync_cmos_clock(struct work_struct *work)
 	 * called as close as possible to 500 ms before the new second starts.
 	 * This code is run on a timer.  If the clock is set, that timer
 	 * may not expire at the correct time.  Thus, we adjust...
+	 * We want the clock to be within a couple of ticks from the target.
 	 */
 	if (!ntp_synced()) {
 		/*
@@ -485,7 +486,7 @@ static void sync_cmos_clock(struct work_struct *work)
 	}

 	getnstimeofday(&now);
-	if (abs(now.tv_nsec - (NSEC_PER_SEC / 2)) <= tick_nsec / 2) {
+	if (abs(now.tv_nsec - (NSEC_PER_SEC / 2)) <= tick_nsec * 5) {
 		struct timespec adjust = now;

 		fail = -ENODEV;
@@ -513,16 +514,17 @@ static void sync_cmos_clock(struct work_struct *work)
 		next.tv_sec++;
 		next.tv_nsec -= NSEC_PER_SEC;
 	}
-	schedule_delayed_work(&sync_cmos_work, timespec_to_jiffies(&next));
+	queue_delayed_work(system_power_efficient_wq,
+			   &sync_cmos_work, timespec_to_jiffies(&next));
 }

-static void notify_cmos_timer(void)
+void ntp_notify_cmos_timer(void)
 {
-	schedule_delayed_work(&sync_cmos_work, 0);
+	queue_delayed_work(system_power_efficient_wq, &sync_cmos_work, 0);
 }

 #else
-static inline void notify_cmos_timer(void) { }
+void ntp_notify_cmos_timer(void) { }
 #endif

@@ -687,8 +689,6 @@ int __do_adjtimex(struct timex *txc, struct timespec *ts, s32 *time_tai)
 	if (!(time_status & STA_NANO))
 		txc->time.tv_usec /= NSEC_PER_USEC;

-	notify_cmos_timer();
-
 	return result;
 }

@@ -786,8 +786,9 @@ static long hardpps_update_freq(struct pps_normtime freq_norm)
 		time_status |= STA_PPSERROR;
 		pps_errcnt++;
 		pps_dec_freq_interval();
-		pr_err("hardpps: PPSERROR: interval too long - %ld s\n",
-				freq_norm.sec);
+		printk_deferred(KERN_ERR
+			"hardpps: PPSERROR: interval too long - %ld s\n",
+			freq_norm.sec);
 		return 0;
 	}

@@ -800,7 +801,8 @@ static long hardpps_update_freq(struct pps_normtime freq_norm)
 	delta = shift_right(ftemp - pps_freq, NTP_SCALE_SHIFT);
 	pps_freq = ftemp;
 	if (delta > PPS_MAXWANDER || delta < -PPS_MAXWANDER) {
-		pr_warning("hardpps: PPSWANDER: change=%ld\n", delta);
+		printk_deferred(KERN_WARNING
+				"hardpps: PPSWANDER: change=%ld\n", delta);
 		time_status |= STA_PPSWANDER;
 		pps_stbcnt++;
 		pps_dec_freq_interval();
@@ -844,8 +846,9 @@ static void hardpps_update_phase(long error)
 	 * the time offset is updated.
 	 */
 	if (jitter > (pps_jitter << PPS_POPCORN)) {
-		pr_warning("hardpps: PPSJITTER: jitter=%ld, limit=%ld\n",
-		       jitter, (pps_jitter << PPS_POPCORN));
+		printk_deferred(KERN_WARNING
+			"hardpps: PPSJITTER: jitter=%ld, limit=%ld\n",
+			jitter, (pps_jitter << PPS_POPCORN));
 		time_status |= STA_PPSJITTER;
 		pps_jitcnt++;
 	} else if (time_status & STA_PPSTIME) {
@@ -902,7 +905,7 @@ void __hardpps(const struct timespec *phase_ts, const struct timespec *raw_ts)
 		time_status |= STA_PPSJITTER;
 		/* restart the frequency calibration interval */
 		pps_fbase = *raw_ts;
-		pr_err("hardpps: PPSJITTER: bad pulse\n");
+		printk_deferred(KERN_ERR "hardpps: PPSJITTER: bad pulse\n");
 		return;
 	}

@@ -923,7 +926,10 @@ void __hardpps(const struct timespec *phase_ts, const struct timespec *raw_ts)

 static int __init ntp_tick_adj_setup(char *str)
 {
-	ntp_tick_adj = simple_strtol(str, NULL, 0);
+	int rc = kstrtol(str, 0, (long *)&ntp_tick_adj);
+
+	if (rc)
+		return rc;
+
 	ntp_tick_adj <<= NTP_SCALE_SHIFT;

 	return 1;
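
The last hunk replaces simple_strtol(), which silently ignores trailing garbage, with kstrtol(), which rejects it. A userspace analogue of the stricter behaviour, built on strtol(3) (a sketch, not kernel code):

    #include <errno.h>
    #include <stdio.h>
    #include <stdlib.h>

    /* Like kstrtol(): reject empty input, trailing junk and out-of-range
     * values instead of silently truncating like simple_strtol(). */
    static int strict_strtol(const char *s, int base, long *res)
    {
            char *end;

            errno = 0;
            *res = strtol(s, &end, base);
            if (errno == ERANGE)
                    return -ERANGE;
            if (end == s || *end != '\0')
                    return -EINVAL;
            return 0;
    }

    int main(void)
    {
            long v;

            printf("\"500\"  -> %d\n", strict_strtol("500", 0, &v));  /* 0 */
            printf("\"500x\" -> %d\n", strict_strtol("500x", 0, &v)); /* -EINVAL */
            printf("\"\"     -> %d\n", strict_strtol("", 0, &v));     /* -EINVAL */
            return 0;
    }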
diff --git a/kernel/time/sched_clock.c b/kernel/time/sched_clock.c
index 0b479a6a22b..01d2d15aa66 100644
--- a/kernel/time/sched_clock.c
+++ b/kernel/time/sched_clock.c
@@ -8,25 +8,28 @@
 #include <linux/clocksource.h>
 #include <linux/init.h>
 #include <linux/jiffies.h>
+#include <linux/ktime.h>
 #include <linux/kernel.h>
 #include <linux/moduleparam.h>
 #include <linux/sched.h>
 #include <linux/syscore_ops.h>
-#include <linux/timer.h>
+#include <linux/hrtimer.h>
 #include <linux/sched_clock.h>
+#include <linux/seqlock.h>
+#include <linux/bitops.h>

 struct clock_data {
+	ktime_t wrap_kt;
 	u64 epoch_ns;
-	u32 epoch_cyc;
-	u32 epoch_cyc_copy;
+	u64 epoch_cyc;
+	seqcount_t seq;
 	unsigned long rate;
 	u32 mult;
 	u32 shift;
 	bool suspended;
 };

-static void sched_clock_poll(unsigned long wrap_ticks);
-static DEFINE_TIMER(sched_clock_timer, sched_clock_poll, 0, 0);
+static struct hrtimer sched_clock_timer;
 static int irqtime = -1;

 core_param(irqtime, irqtime, int, 0400);
@@ -35,42 +38,39 @@ static struct clock_data cd = {
 	.mult	= NSEC_PER_SEC / HZ,
 };

-static u32 __read_mostly sched_clock_mask = 0xffffffff;
+static u64 __read_mostly sched_clock_mask;

-static u32 notrace jiffy_sched_clock_read(void)
+static u64 notrace jiffy_sched_clock_read(void)
 {
-	return (u32)(jiffies - INITIAL_JIFFIES);
+	/*
+	 * We don't need to use get_jiffies_64 on 32-bit arches here
+	 * because we register with BITS_PER_LONG
+	 */
+	return (u64)(jiffies - INITIAL_JIFFIES);
 }

-static u32 __read_mostly (*read_sched_clock)(void) = jiffy_sched_clock_read;
+static u64 __read_mostly (*read_sched_clock)(void) = jiffy_sched_clock_read;

 static inline u64 notrace cyc_to_ns(u64 cyc, u32 mult, u32 shift)
 {
 	return (cyc * mult) >> shift;
 }

-static unsigned long long notrace sched_clock_32(void)
+unsigned long long notrace sched_clock(void)
 {
 	u64 epoch_ns;
-	u32 epoch_cyc;
-	u32 cyc;
+	u64 epoch_cyc;
+	u64 cyc;
+	unsigned long seq;

 	if (cd.suspended)
 		return cd.epoch_ns;

-	/*
-	 * Load the epoch_cyc and epoch_ns atomically. We do this by
-	 * ensuring that we always write epoch_cyc, epoch_ns and
-	 * epoch_cyc_copy in strict order, and read them in strict order.
-	 * If epoch_cyc and epoch_cyc_copy are not equal, then we're in
-	 * the middle of an update, and we should repeat the load.
-	 */
 	do {
+		seq = raw_read_seqcount_begin(&cd.seq);
 		epoch_cyc = cd.epoch_cyc;
-		smp_rmb();
 		epoch_ns = cd.epoch_ns;
-		smp_rmb();
-	} while (epoch_cyc != cd.epoch_cyc_copy);
+	} while (read_seqcount_retry(&cd.seq, seq));

 	cyc = read_sched_clock();
 	cyc = (cyc - epoch_cyc) & sched_clock_mask;
@@ -83,49 +83,68 @@ static unsigned long long notrace sched_clock_32(void)
 static void notrace update_sched_clock(void)
 {
 	unsigned long flags;
-	u32 cyc;
+	u64 cyc;
 	u64 ns;

 	cyc = read_sched_clock();
 	ns = cd.epoch_ns +
 		cyc_to_ns((cyc - cd.epoch_cyc) & sched_clock_mask,
 			  cd.mult, cd.shift);
-	/*
-	 * Write epoch_cyc and epoch_ns in a way that the update is
-	 * detectable in cyc_to_fixed_sched_clock().
-	 */
+
 	raw_local_irq_save(flags);
-	cd.epoch_cyc_copy = cyc;
-	smp_wmb();
+	raw_write_seqcount_begin(&cd.seq);
 	cd.epoch_ns = ns;
-	smp_wmb();
 	cd.epoch_cyc = cyc;
+	raw_write_seqcount_end(&cd.seq);
 	raw_local_irq_restore(flags);
 }

-static void sched_clock_poll(unsigned long wrap_ticks)
+static enum hrtimer_restart sched_clock_poll(struct hrtimer *hrt)
 {
-	mod_timer(&sched_clock_timer, round_jiffies(jiffies + wrap_ticks));
 	update_sched_clock();
+	hrtimer_forward_now(hrt, cd.wrap_kt);
+	return HRTIMER_RESTART;
 }

-void __init setup_sched_clock(u32 (*read)(void), int bits, unsigned long rate)
+void __init sched_clock_register(u64 (*read)(void), int bits,
+				 unsigned long rate)
 {
-	unsigned long r, w;
-	u64 res, wrap;
+	u64 res, wrap, new_mask, new_epoch, cyc, ns;
+	u32 new_mult, new_shift;
+	ktime_t new_wrap_kt;
+	unsigned long r;
 	char r_unit;

 	if (cd.rate > rate)
 		return;

-	BUG_ON(bits > 32);
 	WARN_ON(!irqs_disabled());
-	read_sched_clock = read;
-	sched_clock_mask = (1ULL << bits) - 1;
-	cd.rate = rate;

 	/* calculate the mult/shift to convert counter ticks to ns. */
-	clocks_calc_mult_shift(&cd.mult, &cd.shift, rate, NSEC_PER_SEC, 0);
+	clocks_calc_mult_shift(&new_mult, &new_shift, rate, NSEC_PER_SEC, 3600);
+
+	new_mask = CLOCKSOURCE_MASK(bits);
+
+	/* calculate how many ns until we wrap */
+	wrap = clocks_calc_max_nsecs(new_mult, new_shift, 0, new_mask);
+	new_wrap_kt = ns_to_ktime(wrap - (wrap >> 3));
+
+	/* update epoch for new counter and update epoch_ns from old counter*/
+	new_epoch = read();
+	cyc = read_sched_clock();
+	ns = cd.epoch_ns +
+		cyc_to_ns((cyc - cd.epoch_cyc) & sched_clock_mask,
+			  cd.mult, cd.shift);
+
+	raw_write_seqcount_begin(&cd.seq);
+	read_sched_clock = read;
+	sched_clock_mask = new_mask;
+	cd.rate = rate;
+	cd.wrap_kt = new_wrap_kt;
+	cd.mult = new_mult;
+	cd.shift = new_shift;
+	cd.epoch_cyc = new_epoch;
+	cd.epoch_ns = ns;
+	raw_write_seqcount_end(&cd.seq);

 	r = rate;
 	if (r >= 4000000) {
@@ -137,27 +156,11 @@ void __init setup_sched_clock(u32 (*read)(void), int bits, unsigned long rate)
 	} else
 		r_unit = ' ';

-	/* calculate how many ns until we wrap */
-	wrap = cyc_to_ns((1ULL << bits) - 1, cd.mult, cd.shift);
-	do_div(wrap, NSEC_PER_MSEC);
-	w = wrap;
-
 	/* calculate the ns resolution of this counter */
-	res = cyc_to_ns(1ULL, cd.mult, cd.shift);
-	pr_info("sched_clock: %u bits at %lu%cHz, resolution %lluns, wraps every %lums\n",
-		bits, r, r_unit, res, w);
+	res = cyc_to_ns(1ULL, new_mult, new_shift);

-	/*
-	 * Start the timer to keep sched_clock() properly updated and
-	 * sets the initial epoch.
-	 */
-	sched_clock_timer.data = msecs_to_jiffies(w - (w / 10));
-	update_sched_clock();
-
-	/*
-	 * Ensure that sched_clock() starts off at 0ns
-	 */
-	cd.epoch_ns = 0;
+	pr_info("sched_clock: %u bits at %lu%cHz, resolution %lluns, wraps every %lluns\n",
+		bits, r, r_unit, res, wrap);

 	/* Enable IRQ time accounting if we have a fast enough sched_clock */
 	if (irqtime > 0 || (irqtime == -1 && rate >= 1000000))
@@ -166,13 +169,6 @@ void __init setup_sched_clock(u32 (*read)(void), int bits, unsigned long rate)
 	pr_debug("Registered %pF as sched_clock source\n", read);
 }

-unsigned long long __read_mostly (*sched_clock_func)(void) = sched_clock_32;
-
-unsigned long long notrace sched_clock(void)
-{
-	return sched_clock_func();
-}
-
 void __init sched_clock_postinit(void)
 {
 	/*
@@ -180,14 +176,23 @@ void __init sched_clock_postinit(void)
 	 * make it the final one one.
 	 */
 	if (read_sched_clock == jiffy_sched_clock_read)
-		setup_sched_clock(jiffy_sched_clock_read, 32, HZ);
+		sched_clock_register(jiffy_sched_clock_read, BITS_PER_LONG, HZ);
+
+	update_sched_clock();

-	sched_clock_poll(sched_clock_timer.data);
+	/*
+	 * Start the timer to keep sched_clock() properly updated and
+	 * sets the initial epoch.
+	 */
+	hrtimer_init(&sched_clock_timer, CLOCK_MONOTONIC, HRTIMER_MODE_REL);
+	sched_clock_timer.function = sched_clock_poll;
+	hrtimer_start(&sched_clock_timer, cd.wrap_kt, HRTIMER_MODE_REL);
 }

 static int sched_clock_suspend(void)
 {
-	sched_clock_poll(sched_clock_timer.data);
+	update_sched_clock();
+	hrtimer_cancel(&sched_clock_timer);
 	cd.suspended = true;
 	return 0;
 }
@@ -195,7 +200,7 @@ static int sched_clock_suspend(void)
 static void sched_clock_resume(void)
 {
 	cd.epoch_cyc = read_sched_clock();
-	cd.epoch_cyc_copy = cd.epoch_cyc;
+	hrtimer_start(&sched_clock_timer, cd.wrap_kt, HRTIMER_MODE_REL);
 	cd.suspended = false;
 }
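
sched_clock() now publishes the epoch through a seqcount instead of the old epoch_cyc/epoch_cyc_copy double-write. A minimal single-writer rendering of the same reader-retry protocol in C11 atomics (a sketch only; the kernel's raw_*_seqcount primitives additionally handle SMP barriers, lockdep and preemption):

    #include <stdatomic.h>
    #include <stdint.h>
    #include <stdio.h>

    static atomic_uint seq;                 /* even: stable, odd: write busy */
    static _Atomic uint64_t epoch_ns, epoch_cyc;

    static void update_epoch(uint64_t ns, uint64_t cyc) /* one writer only */
    {
            unsigned int s = atomic_load_explicit(&seq, memory_order_relaxed);

            atomic_store_explicit(&seq, s + 1, memory_order_relaxed);
            atomic_thread_fence(memory_order_release);
            atomic_store_explicit(&epoch_ns, ns, memory_order_relaxed);
            atomic_store_explicit(&epoch_cyc, cyc, memory_order_relaxed);
            atomic_store_explicit(&seq, s + 2, memory_order_release);
    }

    static void read_epoch(uint64_t *ns, uint64_t *cyc)
    {
            unsigned int s1, s2;

            do {
                    s1 = atomic_load_explicit(&seq, memory_order_acquire);
                    *ns = atomic_load_explicit(&epoch_ns, memory_order_relaxed);
                    *cyc = atomic_load_explicit(&epoch_cyc, memory_order_relaxed);
                    atomic_thread_fence(memory_order_acquire);
                    s2 = atomic_load_explicit(&seq, memory_order_relaxed);
            } while ((s1 & 1) || s1 != s2);  /* retry if a writer was active */
    }

    int main(void)
    {
            uint64_t ns, cyc;

            update_epoch(1000, 123456);
            read_epoch(&ns, &cyc);
            printf("epoch: %llu ns @ cycle %llu\n",
                   (unsigned long long)ns, (unsigned long long)cyc);
            return 0;
    }

The reader never blocks the (irq-protected) writer, which is why this pattern suits a function as hot as sched_clock().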
diff --git a/kernel/time/tick-broadcast-hrtimer.c b/kernel/time/tick-broadcast-hrtimer.c
new file mode 100644
index 00000000000..eb682d5c697
--- /dev/null
+++ b/kernel/time/tick-broadcast-hrtimer.c
@@ -0,0 +1,106 @@
+/*
+ * linux/kernel/time/tick-broadcast-hrtimer.c
+ * This file emulates a local clock event device
+ * via a pseudo clock device.
+ */
+#include <linux/cpu.h>
+#include <linux/err.h>
+#include <linux/hrtimer.h>
+#include <linux/interrupt.h>
+#include <linux/percpu.h>
+#include <linux/profile.h>
+#include <linux/clockchips.h>
+#include <linux/sched.h>
+#include <linux/smp.h>
+#include <linux/module.h>
+
+#include "tick-internal.h"
+
+static struct hrtimer bctimer;
+
+static void bc_set_mode(enum clock_event_mode mode,
+			struct clock_event_device *bc)
+{
+	switch (mode) {
+	case CLOCK_EVT_MODE_SHUTDOWN:
+		/*
+		 * Note, we cannot cancel the timer here as we might
+		 * run into the following live lock scenario:
+		 *
+		 * cpu 0		cpu1
+		 * lock(broadcast_lock);
+		 *			hrtimer_interrupt()
+		 *			bc_handler()
+		 *			   tick_handle_oneshot_broadcast();
+		 *			    lock(broadcast_lock);
+		 * hrtimer_cancel()
+		 *  wait_for_callback()
+		 */
+		hrtimer_try_to_cancel(&bctimer);
+		break;
+	default:
+		break;
+	}
+}
+
+/*
+ * This is called from the guts of the broadcast code when the cpu
+ * which is about to enter idle has the earliest broadcast timer event.
+ */
+static int bc_set_next(ktime_t expires, struct clock_event_device *bc)
+{
+	/*
+	 * We try to cancel the timer first. If the callback is on
+	 * flight on some other cpu then we let it handle it. If we
+	 * were able to cancel the timer nothing can rearm it as we
+	 * own broadcast_lock.
+	 *
+	 * However we can also be called from the event handler of
+	 * ce_broadcast_hrtimer itself when it expires. We cannot
+	 * restart the timer because we are in the callback, but we
+	 * can set the expiry time and let the callback return
+	 * HRTIMER_RESTART.
+	 */
+	if (hrtimer_try_to_cancel(&bctimer) >= 0) {
+		hrtimer_start(&bctimer, expires, HRTIMER_MODE_ABS_PINNED);
+		/* Bind the "device" to the cpu */
+		bc->bound_on = smp_processor_id();
+	} else if (bc->bound_on == smp_processor_id()) {
+		hrtimer_set_expires(&bctimer, expires);
+	}
+	return 0;
+}
+
+static struct clock_event_device ce_broadcast_hrtimer = {
+	.set_mode		= bc_set_mode,
+	.set_next_ktime		= bc_set_next,
+	.features		= CLOCK_EVT_FEAT_ONESHOT |
+				  CLOCK_EVT_FEAT_KTIME |
+				  CLOCK_EVT_FEAT_HRTIMER,
+	.rating			= 0,
+	.bound_on		= -1,
+	.min_delta_ns		= 1,
+	.max_delta_ns		= KTIME_MAX,
+	.min_delta_ticks	= 1,
+	.max_delta_ticks	= ULONG_MAX,
+	.mult			= 1,
+	.shift			= 0,
+	.cpumask		= cpu_all_mask,
+};
+
+static enum hrtimer_restart bc_handler(struct hrtimer *t)
+{
+	ce_broadcast_hrtimer.event_handler(&ce_broadcast_hrtimer);
+
+	if (ce_broadcast_hrtimer.next_event.tv64 == KTIME_MAX)
+		return HRTIMER_NORESTART;
+
+	return HRTIMER_RESTART;
+}
+
+void tick_setup_hrtimer_broadcast(void)
+{
+	hrtimer_init(&bctimer, CLOCK_MONOTONIC, HRTIMER_MODE_ABS);
+	bctimer.function = bc_handler;
+	clockevents_register_device(&ce_broadcast_hrtimer);
+}
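
The new file emulates a clock event device on top of an hrtimer, so one live CPU can drive wakeups for CPUs whose local timers stop in deep idle. Loosely, the nearest userspace analogue of "a tick device emulated on a monotonic timer" is a timerfd; an illustrative Linux-only sketch with an arbitrary 100 ms period (not part of the patch):

    #include <stdint.h>
    #include <stdio.h>
    #include <sys/timerfd.h>
    #include <unistd.h>

    int main(void)
    {
            /* A periodic "broadcast tick" on CLOCK_MONOTONIC, analogous
             * to emulating a tick device on top of a timer. */
            struct itimerspec its = {
                    .it_value    = { .tv_sec = 0, .tv_nsec = 100 * 1000 * 1000 },
                    .it_interval = { .tv_sec = 0, .tv_nsec = 100 * 1000 * 1000 },
            };
            int fd = timerfd_create(CLOCK_MONOTONIC, 0);
            uint64_t expirations;

            if (fd < 0 || timerfd_settime(fd, 0, &its, NULL) < 0)
                    return 1;

            for (int i = 0; i < 3; i++) {   /* three demo "broadcasts" */
                    if (read(fd, &expirations, sizeof(expirations)) !=
                        sizeof(expirations))
                            break;
                    printf("tick %d (%llu expiration(s))\n", i,
                           (unsigned long long)expirations);
            }
            close(fd);
            return 0;
    }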
diff --git a/kernel/time/tick-broadcast.c b/kernel/time/tick-broadcast.c
index 218bcb565fe..64c5990fd50 100644
--- a/kernel/time/tick-broadcast.c
+++ b/kernel/time/tick-broadcast.c
@@ -70,6 +70,7 @@ static bool tick_check_broadcast_device(struct clock_event_device *curdev,
 					struct clock_event_device *newdev)
 {
 	if ((newdev->features & CLOCK_EVT_FEAT_DUMMY) ||
+	    (newdev->features & CLOCK_EVT_FEAT_PERCPU) ||
 	    (newdev->features & CLOCK_EVT_FEAT_C3STOP))
 		return false;

@@ -119,6 +120,19 @@ int tick_is_broadcast_device(struct clock_event_device *dev)
 	return (dev && tick_broadcast_device.evtdev == dev);
 }

+int tick_broadcast_update_freq(struct clock_event_device *dev, u32 freq)
+{
+	int ret = -ENODEV;
+
+	if (tick_is_broadcast_device(dev)) {
+		raw_spin_lock(&tick_broadcast_lock);
+		ret = __clockevents_update_freq(dev, freq);
+		raw_spin_unlock(&tick_broadcast_lock);
+	}
+	return ret;
+}
+
+
 static void err_broadcast(const struct cpumask *mask)
 {
 	pr_crit_once("Failed to broadcast timer tick. Some CPUs may be unresponsive.\n");
@@ -271,12 +285,8 @@ static void tick_do_broadcast(struct cpumask *mask)
  */
 static void tick_do_periodic_broadcast(void)
 {
-	raw_spin_lock(&tick_broadcast_lock);
-
 	cpumask_and(tmpmask, cpu_online_mask, tick_broadcast_mask);
 	tick_do_broadcast(tmpmask);
-
-	raw_spin_unlock(&tick_broadcast_lock);
 }

 /*
@@ -286,13 +296,15 @@ static void tick_handle_periodic_broadcast(struct clock_event_device *dev)
 {
 	ktime_t next;

+	raw_spin_lock(&tick_broadcast_lock);
+
 	tick_do_periodic_broadcast();

 	/*
 	 * The device is in periodic mode. No reprogramming necessary:
 	 */
 	if (dev->mode == CLOCK_EVT_MODE_PERIODIC)
-		return;
+		goto unlock;

 	/*
 	 * Setup the next period for devices, which do not have
@@ -305,9 +317,11 @@ static void tick_handle_periodic_broadcast(struct clock_event_device *dev)
 		next = ktime_add(next, tick_period);

 		if (!clockevents_program_event(dev, next, false))
-			return;
+			goto unlock;
 		tick_do_periodic_broadcast();
 	}
+unlock:
+	raw_spin_unlock(&tick_broadcast_lock);
 }

 /*
@@ -537,10 +551,10 @@ int tick_resume_broadcast_oneshot(struct clock_event_device *bc)
  * Called from irq_enter() when idle was interrupted to reenable the
  * per cpu device.
  */
-void tick_check_oneshot_broadcast(int cpu)
+void tick_check_oneshot_broadcast_this_cpu(void)
 {
-	if (cpumask_test_cpu(cpu, tick_broadcast_oneshot_mask)) {
-		struct tick_device *td = &per_cpu(tick_cpu_device, cpu);
+	if (cpumask_test_cpu(smp_processor_id(), tick_broadcast_oneshot_mask)) {
+		struct tick_device *td = &__get_cpu_var(tick_cpu_device);

 		/*
 		 * We might be in the middle of switching over from
@@ -629,24 +643,61 @@ again:
 	raw_spin_unlock(&tick_broadcast_lock);
 }

+static int broadcast_needs_cpu(struct clock_event_device *bc, int cpu)
+{
+	if (!(bc->features & CLOCK_EVT_FEAT_HRTIMER))
+		return 0;
+	if (bc->next_event.tv64 == KTIME_MAX)
+		return 0;
+	return bc->bound_on == cpu ? -EBUSY : 0;
+}
+
+static void broadcast_shutdown_local(struct clock_event_device *bc,
+				     struct clock_event_device *dev)
+{
+	/*
+	 * For hrtimer based broadcasting we cannot shutdown the cpu
+	 * local device if our own event is the first one to expire or
+	 * if we own the broadcast timer.
+	 */
+	if (bc->features & CLOCK_EVT_FEAT_HRTIMER) {
+		if (broadcast_needs_cpu(bc, smp_processor_id()))
+			return;
+		if (dev->next_event.tv64 < bc->next_event.tv64)
+			return;
+	}
+	clockevents_set_mode(dev, CLOCK_EVT_MODE_SHUTDOWN);
+}
+
+static void broadcast_move_bc(int deadcpu)
+{
+	struct clock_event_device *bc = tick_broadcast_device.evtdev;
+
+	if (!bc || !broadcast_needs_cpu(bc, deadcpu))
+		return;
+	/* This moves the broadcast assignment to this cpu */
+	clockevents_program_event(bc, bc->next_event, 1);
+}
+
 /*
  * Powerstate information: The system enters/leaves a state, where
  * affected devices might stop
+ * Returns 0 on success, -EBUSY if the cpu is used to broadcast wakeups.
  */
-void tick_broadcast_oneshot_control(unsigned long reason)
+int tick_broadcast_oneshot_control(unsigned long reason)
 {
 	struct clock_event_device *bc, *dev;
 	struct tick_device *td;
 	unsigned long flags;
 	ktime_t now;
-	int cpu;
+	int cpu, ret = 0;

 	/*
 	 * Periodic mode does not care about the enter/exit of power
 	 * states
 	 */
 	if (tick_broadcast_device.mode == TICKDEV_MODE_PERIODIC)
-		return;
+		return 0;

 	/*
 	 * We are called with preemtion disabled from the depth of the
@@ -657,7 +708,7 @@ void tick_broadcast_oneshot_control(unsigned long reason)
 	dev = td->evtdev;

 	if (!(dev->features & CLOCK_EVT_FEAT_C3STOP))
-		return;
+		return 0;

 	bc = tick_broadcast_device.evtdev;

@@ -665,7 +716,7 @@ void tick_broadcast_oneshot_control(unsigned long reason)
 	if (reason == CLOCK_EVT_NOTIFY_BROADCAST_ENTER) {
 		if (!cpumask_test_and_set_cpu(cpu, tick_broadcast_oneshot_mask)) {
 			WARN_ON_ONCE(cpumask_test_cpu(cpu, tick_broadcast_pending_mask));
-			clockevents_set_mode(dev, CLOCK_EVT_MODE_SHUTDOWN);
+			broadcast_shutdown_local(bc, dev);
 			/*
 			 * We only reprogram the broadcast timer if we
 			 * did not mark ourself in the force mask and
@@ -678,6 +729,16 @@ void tick_broadcast_oneshot_control(unsigned long reason)
 			    dev->next_event.tv64 < bc->next_event.tv64)
 				tick_broadcast_set_event(bc, cpu, dev->next_event, 1);
 		}
+		/*
+		 * If the current CPU owns the hrtimer broadcast
+		 * mechanism, it cannot go deep idle and we remove the
+		 * CPU from the broadcast mask. We don't have to go
+		 * through the EXIT path as the local timer is not
+		 * shutdown.
+		 */
+		ret = broadcast_needs_cpu(bc, cpu);
+		if (ret)
+			cpumask_clear_cpu(cpu, tick_broadcast_oneshot_mask);
 	} else {
 		if (cpumask_test_and_clear_cpu(cpu, tick_broadcast_oneshot_mask)) {
 			clockevents_set_mode(dev, CLOCK_EVT_MODE_ONESHOT);
@@ -745,6 +806,7 @@ void tick_broadcast_oneshot_control(unsigned long reason)
 	}
 out:
 	raw_spin_unlock_irqrestore(&tick_broadcast_lock, flags);
+	return ret;
 }

 /*
@@ -755,6 +817,7 @@ out:
 static void tick_broadcast_clear_oneshot(int cpu)
 {
 	cpumask_clear_cpu(cpu, tick_broadcast_oneshot_mask);
+	cpumask_clear_cpu(cpu, tick_broadcast_pending_mask);
 }

 static void tick_broadcast_init_next_event(struct cpumask *mask,
@@ -850,6 +913,8 @@ void tick_shutdown_broadcast_oneshot(unsigned int *cpup)
 	cpumask_clear_cpu(cpu, tick_broadcast_pending_mask);
 	cpumask_clear_cpu(cpu, tick_broadcast_force_mask);

+	broadcast_move_bc(cpu);
+
 	raw_spin_unlock_irqrestore(&tick_broadcast_lock, flags);
 }
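
broadcast_needs_cpu() above is the heart of the new -EBUSY path: the CPU that currently hosts the hrtimer-emulated broadcast device must not enter deep idle while an event is armed. A standalone rendering of just that decision (feature flag, times and CPU numbers are made up):

    #include <assert.h>
    #include <errno.h>
    #include <stdint.h>
    #include <stdio.h>

    #define KTIME_MAX    INT64_MAX
    #define FEAT_HRTIMER 0x1

    struct bc_dev {
            unsigned int features;
            int64_t next_event;
            int bound_on;
    };

    /* Mirrors broadcast_needs_cpu(): only an hrtimer-emulated broadcast
     * device with an armed event pins its owning CPU. */
    static int needs_cpu(const struct bc_dev *bc, int cpu)
    {
            if (!(bc->features & FEAT_HRTIMER))
                    return 0;       /* real hardware: any CPU may idle */
            if (bc->next_event == KTIME_MAX)
                    return 0;       /* nothing armed */
            return bc->bound_on == cpu ? -EBUSY : 0;
    }

    int main(void)
    {
            struct bc_dev bc = { FEAT_HRTIMER, 5000000, .bound_on = 2 };

            assert(needs_cpu(&bc, 2) == -EBUSY); /* owner must stay awake */
            assert(needs_cpu(&bc, 3) == 0);      /* other CPUs may idle */
            bc.next_event = KTIME_MAX;
            assert(needs_cpu(&bc, 2) == 0);      /* disarmed: owner is free */
            puts("broadcast ownership checks pass");
            return 0;
    }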
diff --git a/kernel/time/tick-common.c b/kernel/time/tick-common.c
index 64522ecdfe0..0a0608edeb2 100644
--- a/kernel/time/tick-common.c
+++ b/kernel/time/tick-common.c
@@ -33,6 +33,21 @@ DEFINE_PER_CPU(struct tick_device, tick_cpu_device);
  */
 ktime_t tick_next_period;
 ktime_t tick_period;
+
+/*
+ * tick_do_timer_cpu is a timer core internal variable which holds the CPU NR
+ * which is responsible for calling do_timer(), i.e. the timekeeping stuff. This
+ * variable has two functions:
+ *
+ * 1) Prevent a thundering herd issue of a gazillion of CPUs trying to grab the
+ *    timekeeping lock all at once. Only the CPU which is assigned to do the
+ *    update is handling it.
+ *
+ * 2) Hand off the duty in the NOHZ idle case by setting the value to
+ *    TICK_DO_TIMER_NONE, i.e. a non existing CPU. So the next cpu which looks
+ *    at it will take over and keep the time keeping alive.  The handover
+ *    procedure also covers cpu hotplug.
+ */
 int tick_do_timer_cpu __read_mostly = TICK_DO_TIMER_BOOT;

 /*
@@ -70,6 +85,7 @@ static void tick_periodic(int cpu)

 		do_timer(1);
 		write_sequnlock(&jiffies_lock);
+		update_wall_time();
 	}

 	update_process_times(user_mode(get_irq_regs()));
@@ -82,18 +98,19 @@ static void tick_periodic(int cpu)
 void tick_handle_periodic(struct clock_event_device *dev)
 {
 	int cpu = smp_processor_id();
-	ktime_t next;
+	ktime_t next = dev->next_event;

 	tick_periodic(cpu);

 	if (dev->mode != CLOCK_EVT_MODE_ONESHOT)
 		return;
-	/*
-	 * Setup the next period for devices, which do not have
-	 * periodic mode:
-	 */
-	next = ktime_add(dev->next_event, tick_period);
 	for (;;) {
+		/*
+		 * Setup the next period for devices, which do not have
+		 * periodic mode:
+		 */
+		next = ktime_add(next, tick_period);
+
 		if (!clockevents_program_event(dev, next, false))
 			return;
 		/*
@@ -102,12 +119,11 @@ void tick_handle_periodic(struct clock_event_device *dev)
 		 * to be sure we're using a real hardware clocksource.
 		 * Otherwise we could get trapped in an infinite
 		 * loop, as the tick_periodic() increments jiffies,
-		 * when then will increment time, posibly causing
+		 * which then will increment time, possibly causing
 		 * the loop to trigger again and again.
 		 */
 		if (timekeeping_valid_for_hres())
 			tick_periodic(cpu);
-		next = ktime_add(next, tick_period);
 	}
 }

@@ -260,7 +276,7 @@ static bool tick_check_preferred(struct clock_event_device *curdev,
 bool tick_check_replacement(struct clock_event_device *curdev,
 			    struct clock_event_device *newdev)
 {
-	if (tick_check_percpu(curdev, newdev, smp_processor_id()))
+	if (!tick_check_percpu(curdev, newdev, smp_processor_id()))
 		return false;

 	return tick_check_preferred(curdev, newdev);
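
The reworked tick_handle_periodic() loop advances next before each programming attempt, so a device that keeps failing with an expiry in the past steps forward period by period until it lands in the future. A toy simulation of the loop's arithmetic (invented numbers, ktime reduced to plain int64 nanoseconds):

    #include <stdint.h>
    #include <stdio.h>

    #define TICK_PERIOD 10000000LL          /* 10 ms in ns, i.e. HZ=100 */

    static int64_t now_ns = 73000000;       /* pretend we were delayed ~73 ms */

    /* Stand-in for clockevents_program_event(): fails when the requested
     * expiry has already passed, like -ETIME in the kernel. */
    static int program_event(int64_t expires)
    {
            return expires <= now_ns ? -1 : 0;
    }

    int main(void)
    {
            int64_t next = 0;               /* dev->next_event analogue */
            int periods = 0;

            /* The patched loop shape: step first, then try to program. */
            for (;;) {
                    next += TICK_PERIOD;
                    periods++;
                    if (!program_event(next))
                            break;
                    /* the kernel also runs tick_periodic() here so the
                     * jiffies being caught up on are accounted */
            }
            printf("caught up %d periods, next event at %lld ns\n",
                   periods, (long long)next);
            return 0;
    }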
diff --git a/kernel/time/tick-internal.h b/kernel/time/tick-internal.h
index bc906cad709..7ab92b19965 100644
--- a/kernel/time/tick-internal.h
+++ b/kernel/time/tick-internal.h
@@ -31,7 +31,7 @@ extern void tick_install_replacement(struct clock_event_device *dev);

 extern void clockevents_shutdown(struct clock_event_device *dev);

-extern size_t sysfs_get_uname(const char *buf, char *dst, size_t cnt);
+extern ssize_t sysfs_get_uname(const char *buf, char *dst, size_t cnt);

 /*
  * NO_HZ / high resolution timer shared code
@@ -46,23 +46,23 @@ extern int tick_switch_to_oneshot(void (*handler)(struct clock_event_device *));
 extern void tick_resume_oneshot(void);
 # ifdef CONFIG_GENERIC_CLOCKEVENTS_BROADCAST
 extern void tick_broadcast_setup_oneshot(struct clock_event_device *bc);
-extern void tick_broadcast_oneshot_control(unsigned long reason);
+extern int tick_broadcast_oneshot_control(unsigned long reason);
 extern void tick_broadcast_switch_to_oneshot(void);
 extern void tick_shutdown_broadcast_oneshot(unsigned int *cpup);
 extern int tick_resume_broadcast_oneshot(struct clock_event_device *bc);
 extern int tick_broadcast_oneshot_active(void);
-extern void tick_check_oneshot_broadcast(int cpu);
+extern void tick_check_oneshot_broadcast_this_cpu(void);
 bool tick_broadcast_oneshot_available(void);
 # else /* BROADCAST */
 static inline void tick_broadcast_setup_oneshot(struct clock_event_device *bc)
 {
 	BUG();
 }
-static inline void tick_broadcast_oneshot_control(unsigned long reason) { }
+static inline int tick_broadcast_oneshot_control(unsigned long reason) { return 0; }
 static inline void tick_broadcast_switch_to_oneshot(void) { }
 static inline void tick_shutdown_broadcast_oneshot(unsigned int *cpup) { }
 static inline int tick_broadcast_oneshot_active(void) { return 0; }
-static inline void tick_check_oneshot_broadcast(int cpu) { }
+static inline void tick_check_oneshot_broadcast_this_cpu(void) { }
 static inline bool tick_broadcast_oneshot_available(void) { return true; }
 # endif /* !BROADCAST */

@@ -87,7 +87,7 @@ static inline void tick_broadcast_setup_oneshot(struct clock_event_device *bc)
 {
 	BUG();
 }
-static inline void tick_broadcast_oneshot_control(unsigned long reason) { }
+static inline int tick_broadcast_oneshot_control(unsigned long reason) { return 0; }
 static inline void tick_shutdown_broadcast_oneshot(unsigned int *cpup) { }
 static inline int tick_resume_broadcast_oneshot(struct clock_event_device *bc)
 {
@@ -111,6 +111,7 @@ extern int tick_resume_broadcast(void);
 extern void tick_broadcast_init(void);
 extern void
 tick_set_periodic_handler(struct clock_event_device *dev, int broadcast);
+int tick_broadcast_update_freq(struct clock_event_device *dev, u32 freq);

 #else /* !BROADCAST */

@@ -133,6 +134,8 @@ static inline void tick_shutdown_broadcast(unsigned int *cpup) { }
 static inline void tick_suspend_broadcast(void) { }
 static inline int tick_resume_broadcast(void) { return 0; }
 static inline void tick_broadcast_init(void) { }
+static inline int tick_broadcast_update_freq(struct clock_event_device *dev,
+					     u32 freq) { return -ENODEV; }

 /*
  * Set the periodic handler in non broadcast mode
@@ -152,6 +155,9 @@ static inline int tick_device_is_functional(struct clock_event_device *dev)
 	return !(dev->features & CLOCK_EVT_FEAT_DUMMY);
 }

+int __clockevents_update_freq(struct clock_event_device *dev, u32 freq);
+
 #endif

 extern void do_timer(unsigned long ticks);
+extern void update_wall_time(void);
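
tick-internal.h follows the standard kernel pattern for optional features: with the config option off, the extern declarations are replaced by static inline stubs with matching signatures and neutral return values (0 or -ENODEV), so call sites compile unchanged. A condensed illustration with a hypothetical feature switch (not kernel code):

    #include <errno.h>
    #include <stdio.h>

    /* Hypothetical switch standing in for CONFIG_GENERIC_CLOCKEVENTS_BROADCAST */
    #define CONFIG_MY_FEATURE 0

    #if CONFIG_MY_FEATURE
    extern int my_feature_update_freq(unsigned int freq);
    #else
    /* The stub keeps callers compiling and reports the feature's absence,
     * just as the new tick_broadcast_update_freq() stub returns -ENODEV. */
    static inline int my_feature_update_freq(unsigned int freq)
    {
            (void)freq;
            return -ENODEV;
    }
    #endif

    int main(void)
    {
            if (my_feature_update_freq(1000000) == -ENODEV)
                    puts("feature compiled out, caller falls back");
            return 0;
    }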
diff --git a/kernel/time/tick-sched.c b/kernel/time/tick-sched.c
index 3612fc77f83..6558b7ac112 100644
--- a/kernel/time/tick-sched.c
+++ b/kernel/time/tick-sched.c
@@ -84,8 +84,12 @@ static void tick_do_update_jiffies64(ktime_t now)

 		/* Keep the tick_next_period variable up to date */
 		tick_next_period = ktime_add(last_jiffies_update, tick_period);
+	} else {
+		write_sequnlock(&jiffies_lock);
+		return;
 	}
 	write_sequnlock(&jiffies_lock);
+	update_wall_time();
 }

 /*
@@ -177,7 +181,7 @@ static bool can_stop_full_tick(void)
 	 * TODO: kick full dynticks CPUs when
 	 * sched_clock_stable is set.
 	 */
-	if (!sched_clock_stable) {
+	if (!sched_clock_stable()) {
 		trace_tick_stop(0, "unstable sched clock\n");
 		/*
 		 * Don't allow the user to think they can get
@@ -361,8 +365,8 @@ void __init tick_nohz_init(void)
 /*
  * NO HZ enabled ?
  */
-int tick_nohz_enabled __read_mostly  = 1;
-
+static int tick_nohz_enabled __read_mostly  = 1;
+int tick_nohz_active  __read_mostly;
 /*
  * Enable / Disable tickless mode
  */
@@ -391,11 +395,9 @@ __setup("nohz=", setup_tick_nohz);
  */
 static void tick_nohz_update_jiffies(ktime_t now)
 {
-	int cpu = smp_processor_id();
-	struct tick_sched *ts = &per_cpu(tick_cpu_sched, cpu);
 	unsigned long flags;

-	ts->idle_waketime = now;
+	__this_cpu_write(tick_cpu_sched.idle_waketime, now);

 	local_irq_save(flags);
 	tick_do_update_jiffies64(now);
@@ -426,17 +428,15 @@ update_ts_time_stats(int cpu, struct tick_sched *ts, ktime_t now, u64 *last_upda

 }

-static void tick_nohz_stop_idle(int cpu, ktime_t now)
+static void tick_nohz_stop_idle(struct tick_sched *ts, ktime_t now)
 {
-	struct tick_sched *ts = &per_cpu(tick_cpu_sched, cpu);
-
-	update_ts_time_stats(cpu, ts, now, NULL);
+	update_ts_time_stats(smp_processor_id(), ts, now, NULL);
 	ts->idle_active = 0;

 	sched_clock_idle_wakeup_event(0);
 }

-static ktime_t tick_nohz_start_idle(int cpu, struct tick_sched *ts)
+static ktime_t tick_nohz_start_idle(struct tick_sched *ts)
 {
 	ktime_t now = ktime_get();

@@ -465,7 +465,7 @@ u64 get_cpu_idle_time_us(int cpu, u64 *last_update_time)
 	struct tick_sched *ts = &per_cpu(tick_cpu_sched, cpu);
 	ktime_t now, idle;

-	if (!tick_nohz_enabled)
+	if (!tick_nohz_active)
 		return -1;

 	now = ktime_get();
@@ -506,7 +506,7 @@ u64 get_cpu_iowait_time_us(int cpu, u64 *last_update_time)
 	struct tick_sched *ts = &per_cpu(tick_cpu_sched, cpu);
 	ktime_t now, iowait;

-	if (!tick_nohz_enabled)
+	if (!tick_nohz_active)
 		return -1;

 	now = ktime_get();
@@ -536,12 +536,13 @@ static ktime_t tick_nohz_stop_sched_tick(struct tick_sched *ts,
 	struct clock_event_device *dev = __get_cpu_var(tick_cpu_device).evtdev;
 	u64 time_delta;

+	time_delta = timekeeping_max_deferment();
+
 	/* Read jiffies and the time when jiffies were updated last */
 	do {
 		seq = read_seqbegin(&jiffies_lock);
 		last_update = last_jiffies_update;
 		last_jiffies = jiffies;
-		time_delta = timekeeping_max_deferment();
 	} while (read_seqretry(&jiffies_lock, seq));

 	if (rcu_needs_cpu(cpu, &rcu_delta_jiffies) ||
@@ -681,18 +682,18 @@ out:
 static void tick_nohz_full_stop_tick(struct tick_sched *ts)
 {
 #ifdef CONFIG_NO_HZ_FULL
-       int cpu = smp_processor_id();
+	int cpu = smp_processor_id();

-       if (!tick_nohz_full_cpu(cpu) || is_idle_task(current))
-               return;
+	if (!tick_nohz_full_cpu(cpu) || is_idle_task(current))
+		return;

-       if (!ts->tick_stopped && ts->nohz_mode == NOHZ_MODE_INACTIVE)
-               return;
+	if (!ts->tick_stopped && ts->nohz_mode == NOHZ_MODE_INACTIVE)
+		return;

-       if (!can_stop_full_tick())
-               return;
+	if (!can_stop_full_tick())
+		return;

-       tick_nohz_stop_sched_tick(ts, ktime_get(), cpu);
+	tick_nohz_stop_sched_tick(ts, ktime_get(), cpu);
 #endif
 }

@@ -711,8 +712,10 @@ static bool can_stop_idle_tick(int cpu, struct tick_sched *ts)
 		return false;
 	}

-	if (unlikely(ts->nohz_mode == NOHZ_MODE_INACTIVE))
+	if (unlikely(ts->nohz_mode == NOHZ_MODE_INACTIVE)) {
+		ts->sleep_length = (ktime_t) { .tv64 = NSEC_PER_SEC/HZ };
 		return false;
+	}

 	if (need_resched())
 		return false;
@@ -752,7 +755,7 @@ static void __tick_nohz_idle_enter(struct tick_sched *ts)
 	ktime_t now, expires;
 	int cpu = smp_processor_id();

-	now = tick_nohz_start_idle(cpu, ts);
+	now = tick_nohz_start_idle(ts);

 	if (can_stop_idle_tick(cpu, ts)) {
 		int was_stopped = ts->tick_stopped;
@@ -799,11 +802,6 @@ void tick_nohz_idle_enter(void)
 	local_irq_disable();

 	ts = &__get_cpu_var(tick_cpu_sched);
-	/*
-	 * set ts->inidle unconditionally. even if the system did not
-	 * switch to nohz mode the cpu frequency governers rely on the
-	 * update of the idle time accounting in tick_nohz_start_idle().
-	 */
 	ts->inidle = 1;
 	__tick_nohz_idle_enter(ts);

@@ -914,8 +912,7 @@ static void tick_nohz_account_idle_ticks(struct tick_sched *ts)
  */
 void tick_nohz_idle_exit(void)
 {
-	int cpu = smp_processor_id();
-	struct tick_sched *ts = &per_cpu(tick_cpu_sched, cpu);
+	struct tick_sched *ts = &__get_cpu_var(tick_cpu_sched);
 	ktime_t now;

 	local_irq_disable();
@@ -928,7 +925,7 @@ void tick_nohz_idle_exit(void)
 	now = ktime_get();

 	if (ts->idle_active)
-		tick_nohz_stop_idle(cpu, now);
+		tick_nohz_stop_idle(ts, now);

 	if (ts->tick_stopped) {
 		tick_nohz_restart_sched_tick(ts, now);
@@ -981,7 +978,7 @@ static void tick_nohz_switch_to_nohz(void)
 		local_irq_enable();
 		return;
 	}
-
+	tick_nohz_active = 1;
 	ts->nohz_mode = NOHZ_MODE_LOWRES;

 	/*
@@ -1012,12 +1009,10 @@ static void tick_nohz_switch_to_nohz(void)
  * timer and do not touch the other magic bits which need to be done
  * when idle is left.
  */
-static void tick_nohz_kick_tick(int cpu, ktime_t now)
+static void tick_nohz_kick_tick(struct tick_sched *ts, ktime_t now)
 {
 #if 0
 	/* Switch back to 2.6.27 behaviour */
-
-	struct tick_sched *ts = &per_cpu(tick_cpu_sched, cpu);
 	ktime_t delta;

 	/*
@@ -1032,36 +1027,36 @@ static void tick_nohz_kick_tick(int cpu, ktime_t now)
 #endif
 }

-static inline void tick_check_nohz(int cpu)
+static inline void tick_nohz_irq_enter(void)
 {
-	struct tick_sched *ts = &per_cpu(tick_cpu_sched, cpu);
+	struct tick_sched *ts = &__get_cpu_var(tick_cpu_sched);
 	ktime_t now;

 	if (!ts->idle_active && !ts->tick_stopped)
 		return;
 	now = ktime_get();
 	if (ts->idle_active)
-		tick_nohz_stop_idle(cpu, now);
+		tick_nohz_stop_idle(ts, now);
 	if (ts->tick_stopped) {
 		tick_nohz_update_jiffies(now);
-		tick_nohz_kick_tick(cpu, now);
+		tick_nohz_kick_tick(ts, now);
 	}
 }

 #else

 static inline void tick_nohz_switch_to_nohz(void) { }
-static inline void tick_check_nohz(int cpu) { }
+static inline void tick_nohz_irq_enter(void) { }

 #endif /* CONFIG_NO_HZ_COMMON */

 /*
  * Called from irq_enter to notify about the possible interruption of idle()
  */
-void tick_check_idle(int cpu)
+void tick_irq_enter(void)
 {
-	tick_check_oneshot_broadcast(cpu);
-	tick_check_nohz(cpu);
+	tick_check_oneshot_broadcast_this_cpu();
+	tick_nohz_irq_enter();
 }

 /*
@@ -1139,8 +1134,10 @@ void tick_setup_sched_timer(void)
 	}

 #ifdef CONFIG_NO_HZ_COMMON
-	if (tick_nohz_enabled)
+	if (tick_nohz_enabled) {
 		ts->nohz_mode = NOHZ_MODE_HIGHRES;
+		tick_nohz_active = 1;
+	}
 #endif
 }
 #endif /* HIGH_RES_TIMERS */
diff --git a/kernel/time/timekeeping.c b/kernel/time/timekeeping.c
index 48b9fffabdc..32d8d6aaedb 100644
--- a/kernel/time/timekeeping.c
+++ b/kernel/time/timekeeping.c
@@ -22,6 +22,7 @@
 #include <linux/tick.h>
 #include <linux/stop_machine.h>
 #include <linux/pvclock_gtod.h>
+#include <linux/compiler.h>

 #include "tick-internal.h"
 #include "ntp_internal.h"
@@ -77,7 +78,7 @@ static void tk_set_wall_to_mono(struct timekeeper *tk, struct timespec wtm)
 	tk->wall_to_monotonic = wtm;
 	set_normalized_timespec(&tmp, -wtm.tv_sec, -wtm.tv_nsec);
 	tk->offs_real = timespec_to_ktime(tmp);
-	tk->offs_tai = ktime_sub(tk->offs_real, ktime_set(tk->tai_offset, 0));
+	tk->offs_tai = ktime_add(tk->offs_real, ktime_set(tk->tai_offset, 0));
 }

 static void tk_set_sleep_time(struct timekeeper *tk, struct timespec t)
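
The ktime_sub() -> ktime_add() change in tk_set_wall_to_mono() (mirrored in __timekeeping_set_tai_offset() below) is a sign-error fix: offs_tai converts monotonic time to CLOCK_TAI, and TAI runs ahead of UTC by tai_offset seconds, so the offset must be offs_real plus tai_offset. In plain integers (hypothetical values, seconds for readability):

    #include <assert.h>
    #include <stdio.h>

    int main(void)
    {
            long long mono = 1000;                     /* seconds since boot */
            long long offs_real = 1400000000 - 1000;   /* realtime = mono + offs_real */
            long long tai_offset = 35;                 /* TAI-UTC circa 2014 */

            /* TAI = realtime + tai_offset, therefore: */
            long long offs_tai = offs_real + tai_offset;   /* the ktime_add() fix */

            assert(mono + offs_tai == (mono + offs_real) + tai_offset);
            printf("realtime=%lld tai=%lld\n", mono + offs_real, mono + offs_tai);
            return 0;
    }

With the old ktime_sub(), CLOCK_TAI would have run 2 * tai_offset seconds behind where it should be.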
@@ -90,8 +91,9 @@ static void tk_set_sleep_time(struct timekeeper *tk, struct timespec t)
 }

 /**
- * timekeeper_setup_internals - Set up internals to use clocksource clock.
+ * tk_setup_internals - Set up internals to use clocksource clock.
  *
+ * @tk:		The target timekeeper to setup.
  * @clock:		Pointer to clocksource.
  *
  * Calculates a fixed cycle/nsec interval for a given clocksource/adjustment
@@ -595,7 +597,7 @@ s32 timekeeping_get_tai_offset(void)
 static void __timekeeping_set_tai_offset(struct timekeeper *tk, s32 tai_offset)
 {
 	tk->tai_offset = tai_offset;
-	tk->offs_tai = ktime_sub(tk->offs_real, ktime_set(tai_offset, 0));
+	tk->offs_tai = ktime_add(tk->offs_real, ktime_set(tai_offset, 0));
 }

 /**
@@ -610,6 +612,7 @@ void timekeeping_set_tai_offset(s32 tai_offset)
 	raw_spin_lock_irqsave(&timekeeper_lock, flags);
 	write_seqcount_begin(&timekeeper_seq);
 	__timekeeping_set_tai_offset(tk, tai_offset);
+	timekeeping_update(tk, TK_MIRROR | TK_CLOCK_WAS_SET);
 	write_seqcount_end(&timekeeper_seq);
 	raw_spin_unlock_irqrestore(&timekeeper_lock, flags);
 	clock_was_set();
@@ -758,7 +761,7 @@ u64 timekeeping_max_deferment(void)
  *
  * XXX - Do be sure to remove it once all arches implement it.
  */
-void __attribute__((weak)) read_persistent_clock(struct timespec *ts)
+void __weak read_persistent_clock(struct timespec *ts)
 {
 	ts->tv_sec = 0;
 	ts->tv_nsec = 0;
@@ -773,7 +776,7 @@ void __attribute__((weak)) read_persistent_clock(struct timespec *ts)
  *
  * XXX - Do be sure to remove it once all arches implement it.
  */
-void __attribute__((weak)) read_boot_clock(struct timespec *ts)
+void __weak read_boot_clock(struct timespec *ts)
 {
 	ts->tv_sec = 0;
 	ts->tv_nsec = 0;
@@ -849,8 +852,9 @@ static void __timekeeping_inject_sleeptime(struct timekeeper *tk,
 						struct timespec *delta)
 {
 	if (!timespec_valid_strict(delta)) {
-		printk(KERN_WARNING "__timekeeping_inject_sleeptime: Invalid "
-					"sleep delta value!\n");
+		printk_deferred(KERN_WARNING
+				"__timekeeping_inject_sleeptime: Invalid "
+				"sleep delta value!\n");
 		return;
 	}
 	tk_xtime_add(tk, delta);
@@ -1023,6 +1027,8 @@ static int timekeeping_suspend(void)
 		timekeeping_suspend_time =
 			timespec_add(timekeeping_suspend_time, delta_delta);
 	}
+
+	timekeeping_update(tk, TK_MIRROR);
 	write_seqcount_end(&timekeeper_seq);
 	raw_spin_unlock_irqrestore(&timekeeper_lock, flags);

@@ -1130,16 +1136,6 @@ static void timekeeping_adjust(struct timekeeper *tk, s64 offset)
 	 * we can adjust by 1.
 	 */
 	error >>= 2;
-	/*
-	 * XXX - In update_wall_time, we round up to the next
-	 * nanosecond, and store the amount rounded up into
-	 * the error. This causes the likely below to be unlikely.
-	 *
-	 * The proper fix is to avoid rounding up by using
-	 * the high precision tk->xtime_nsec instead of
-	 * xtime.tv_nsec everywhere. Fixing this will take some
-	 * time.
-	 */
 	if (likely(error <= interval))
 		adj = 1;
 	else
@@ -1162,7 +1158,7 @@ static void timekeeping_adjust(struct timekeeper *tk, s64 offset)
 	if (unlikely(tk->clock->maxadj &&
 		(tk->mult + adj > tk->clock->mult + tk->clock->maxadj))) {
-		printk_once(KERN_WARNING
+		printk_deferred_once(KERN_WARNING
 			"Adjusting %s more than 11%% (%ld vs %ld)\n",
 			tk->clock->name, (long)tk->mult + adj,
 			(long)tk->clock->mult + tk->clock->maxadj);
@@ -1255,7 +1251,7 @@ out_adjust:
 static inline unsigned int accumulate_nsecs_to_secs(struct timekeeper *tk)
 {
 	u64 nsecps = (u64)NSEC_PER_SEC << tk->shift;
-	unsigned int action = 0;
+	unsigned int clock_set = 0;

 	while (tk->xtime_nsec >= nsecps) {
 		int leap;
@@ -1277,11 +1273,10 @@ static inline unsigned int accumulate_nsecs_to_secs(struct timekeeper *tk)

 			__timekeeping_set_tai_offset(tk, tk->tai_offset - leap);

-			clock_was_set_delayed();
-			action = TK_CLOCK_WAS_SET;
+			clock_set = TK_CLOCK_WAS_SET;
 		}
 	}
-	return action;
+	return clock_set;
 }

 /**
@@ -1294,7 +1289,8 @@ static inline unsigned int accumulate_nsecs_to_secs(struct timekeeper *tk)
  * Returns the unconsumed cycles.
  */
 static cycle_t logarithmic_accumulation(struct timekeeper *tk, cycle_t offset,
-						u32 shift)
+						u32 shift,
+						unsigned int *clock_set)
 {
 	cycle_t interval = tk->cycle_interval << shift;
 	u64 raw_nsecs;
@@ -1308,7 +1304,7 @@ static cycle_t logarithmic_accumulation(struct timekeeper *tk, cycle_t offset,
 	tk->cycle_last += interval;

 	tk->xtime_nsec += tk->xtime_interval << shift;
-	accumulate_nsecs_to_secs(tk);
+	*clock_set |= accumulate_nsecs_to_secs(tk);

 	/* Accumulate raw time */
 	raw_nsecs = (u64)tk->raw_interval << shift;
@@ -1347,7 +1343,7 @@ static inline void old_vsyscall_fixup(struct timekeeper *tk)
 	tk->xtime_nsec -= remainder;
 	tk->xtime_nsec += 1ULL << tk->shift;
 	tk->ntp_error += remainder << tk->ntp_error_shift;
-
+	tk->ntp_error -= (1ULL << tk->shift) << tk->ntp_error_shift;
 }
 #else
 #define old_vsyscall_fixup(tk)
@@ -1359,14 +1355,14 @@ static inline void old_vsyscall_fixup(struct timekeeper *tk)
  * update_wall_time - Uses the current clocksource to increment the wall time
  *
  */
-static void update_wall_time(void)
+void update_wall_time(void)
 {
 	struct clocksource *clock;
 	struct timekeeper *real_tk = &timekeeper;
 	struct timekeeper *tk = &shadow_timekeeper;
 	cycle_t offset;
 	int shift = 0, maxshift;
-	unsigned int action;
+	unsigned int clock_set = 0;
 	unsigned long flags;

 	raw_spin_lock_irqsave(&timekeeper_lock, flags);
@@ -1401,7 +1397,8 @@ static void update_wall_time(void)
 	maxshift = (64 - (ilog2(ntp_tick_length())+1)) - 1;
 	shift = min(shift, maxshift);
 	while (offset >= tk->cycle_interval) {
-		offset = logarithmic_accumulation(tk, offset, shift);
+		offset = logarithmic_accumulation(tk, offset, shift,
+						  &clock_set);
 		if (offset < tk->cycle_interval<<shift)
 			shift--;
 	}
@@ -1419,7 +1416,7 @@ static void update_wall_time(void)
 	 * Finally, make sure that after the rounding
 	 * xtime_nsec isn't larger than NSEC_PER_SEC
 	 */
-	action = accumulate_nsecs_to_secs(tk);
+	clock_set |= accumulate_nsecs_to_secs(tk);

 	write_seqcount_begin(&timekeeper_seq);
 	/* Update clock->cycle_last with the new value */
@@ -1435,10 +1432,13 @@ static void update_wall_time(void)
 	 * updating.
 	 */
 	memcpy(real_tk, tk, sizeof(*tk));
-	timekeeping_update(real_tk, action);
+	timekeeping_update(real_tk, clock_set);
 	write_seqcount_end(&timekeeper_seq);
 out:
 	raw_spin_unlock_irqrestore(&timekeeper_lock, flags);
+	if (clock_set)
+		/* Have to call _delayed version, since in irq context*/
+		clock_was_set_delayed();
 }

 /**
@@ -1583,7 +1583,6 @@ struct timespec get_monotonic_coarse(void)
 void do_timer(unsigned long ticks)
 {
 	jiffies_64 += ticks;
-	update_wall_time();
 	calc_global_load(ticks);
 }

@@ -1613,9 +1612,10 @@ void get_xtime_and_monotonic_and_sleep_offset(struct timespec *xtim,
  * ktime_get_update_offsets - hrtimer helper
  * @offs_real:	pointer to storage for monotonic -> realtime offset
  * @offs_boot:	pointer to storage for monotonic -> boottime offset
+ * @offs_tai:	pointer to storage for monotonic -> clock tai offset
  *
  * Returns current monotonic time and updates the offsets
- * Called from hrtimer_interupt() or retrigger_next_event()
+ * Called from hrtimer_interrupt() or retrigger_next_event()
  */
 ktime_t ktime_get_update_offsets(ktime_t *offs_real, ktime_t *offs_boot,
 							ktime_t *offs_tai)
@@ -1697,12 +1697,16 @@ int do_adjtimex(struct timex *txc)

 	if (tai != orig_tai) {
 		__timekeeping_set_tai_offset(tk, tai);
-		update_pvclock_gtod(tk, true);
-		clock_was_set_delayed();
+		timekeeping_update(tk, TK_MIRROR | TK_CLOCK_WAS_SET);
 	}
 	write_seqcount_end(&timekeeper_seq);
 	raw_spin_unlock_irqrestore(&timekeeper_lock, flags);

+	if (tai != orig_tai)
+		clock_was_set();
+
+	ntp_notify_cmos_timer();
+
 	return ret;
 }

@@ -1736,4 +1740,5 @@ void xtime_update(unsigned long ticks)
 	write_seqlock(&jiffies_lock);
 	do_timer(ticks);
 	write_sequnlock(&jiffies_lock);
+	update_wall_time();
 }
diff --git a/kernel/time/timekeeping_debug.c b/kernel/time/timekeeping_debug.c
index 802433a4f5e..4d54f97558d 100644
--- a/kernel/time/timekeeping_debug.c
+++ b/kernel/time/timekeeping_debug.c
@@ -21,6 +21,8 @@
 #include <linux/seq_file.h>
 #include <linux/time.h>

+#include "timekeeping_internal.h"
+
 static unsigned int sleep_time_bin[32] = {0};

 static int tk_debug_show_sleep_time(struct seq_file *s, void *data)
diff --git a/kernel/time/timer_stats.c b/kernel/time/timer_stats.c
index 0b537f27b55..1fb08f21302 100644
--- a/kernel/time/timer_stats.c
+++ b/kernel/time/timer_stats.c
@@ -298,15 +298,15 @@ static int tstats_show(struct seq_file *m, void *v)
 	period = ktime_to_timespec(time);
 	ms = period.tv_nsec / 1000000;

-	seq_puts(m, "Timer Stats Version: v0.2\n");
+	seq_puts(m, "Timer Stats Version: v0.3\n");
 	seq_printf(m, "Sample period: %ld.%03ld s\n", period.tv_sec, ms);
 	if (atomic_read(&overflow_count))
-		seq_printf(m, "Overflow: %d entries\n",
-			atomic_read(&overflow_count));
+		seq_printf(m, "Overflow: %d entries\n", atomic_read(&overflow_count));
+	seq_printf(m, "Collection: %s\n", timer_stats_active ? "active" : "inactive");

 	for (i = 0; i < nr_entries; i++) {
 		entry = entries + i;
-		if (entry->timer_flag & TIMER_STATS_FLAG_DEFERRABLE) {
+		if (entry->timer_flag & TIMER_STATS_FLAG_DEFERRABLE) {
 			seq_printf(m, "%4luD, %5d %-16s ",
 				entry->count, entry->pid, entry->comm);
 		} else {
