Diffstat (limited to 'arch/powerpc/kernel/time.c')
-rw-r--r--  arch/powerpc/kernel/time.c  969
1 file changed, 443 insertions(+), 526 deletions(-)
diff --git a/arch/powerpc/kernel/time.c b/arch/powerpc/kernel/time.c
index 3b26fbd6bec..9fff9cdcc51 100644
--- a/arch/powerpc/kernel/time.c
+++ b/arch/powerpc/kernel/time.c
@@ -17,8 +17,7 @@
  *
  * TODO (not necessarily in this file):
  * - improve precision and reproducibility of timebase frequency
- *   measurement at boot time. (for iSeries, we calibrate the timebase
- *   against the Titan chip's clock.)
+ *   measurement at boot time.
  * - for astronomical applications: add a new function to get
  *   non ambiguous timestamps even around leap seconds. This needs
  *   a new timestamp format and a good name.
@@ -33,7 +32,7 @@
  */

 #include <linux/errno.h>
-#include <linux/module.h>
+#include <linux/export.h>
 #include <linux/sched.h>
 #include <linux/kernel.h>
 #include <linux/param.h>
@@ -43,6 +42,7 @@
 #include <linux/timex.h>
 #include <linux/kernel_stat.h>
 #include <linux/time.h>
+#include <linux/clockchips.h>
 #include <linux/init.h>
 #include <linux/profile.h>
 #include <linux/cpu.h>
@@ -52,6 +52,9 @@
 #include <linux/jiffies.h>
 #include <linux/posix-timers.h>
 #include <linux/irq.h>
+#include <linux/delay.h>
+#include <linux/irq_work.h>
+#include <asm/trace.h>

 #include <asm/io.h>
 #include <asm/processor.h>
@@ -67,35 +70,27 @@
 #include <asm/vdso_datapage.h>
 #include <asm/firmware.h>
 #include <asm/cputime.h>
-#ifdef CONFIG_PPC_ISERIES
-#include <asm/iseries/it_lp_queue.h>
-#include <asm/iseries/hv_call_xm.h>
-#endif

 /* powerpc clocksource/clockevent code */

 #include <linux/clockchips.h>
-#include <linux/clocksource.h>
+#include <linux/timekeeper_internal.h>

-static cycle_t rtc_read(void);
+static cycle_t rtc_read(struct clocksource *);
 static struct clocksource clocksource_rtc = {
        .name = "rtc",
        .rating = 400,
        .flags = CLOCK_SOURCE_IS_CONTINUOUS,
        .mask = CLOCKSOURCE_MASK(64),
-       .shift = 22,
-       .mult = 0,      /* To be filled in */
        .read = rtc_read,
 };

-static cycle_t timebase_read(void);
+static cycle_t timebase_read(struct clocksource *);
 static struct clocksource clocksource_timebase = {
        .name = "timebase",
        .rating = 400,
        .flags = CLOCK_SOURCE_IS_CONTINUOUS,
        .mask = CLOCKSOURCE_MASK(64),
-       .shift = 22,
-       .mult = 0,      /* To be filled in */
        .read = timebase_read,
 };

@@ -106,31 +101,18 @@ static int decrementer_set_next_event(unsigned long evt,
 static void decrementer_set_mode(enum clock_event_mode mode,
                                  struct clock_event_device *dev);

-static struct clock_event_device decrementer_clockevent = {
-       .name           = "decrementer",
-       .rating         = 200,
-       .shift          = 16,
-       .mult           = 0,    /* To be filled in */
-       .irq            = 0,
-       .set_next_event = decrementer_set_next_event,
-       .set_mode       = decrementer_set_mode,
-       .features       = CLOCK_EVT_FEAT_ONESHOT,
+struct clock_event_device decrementer_clockevent = {
+       .name           = "decrementer",
+       .rating         = 200,
+       .irq            = 0,
+       .set_next_event = decrementer_set_next_event,
+       .set_mode       = decrementer_set_mode,
+       .features       = CLOCK_EVT_FEAT_ONESHOT | CLOCK_EVT_FEAT_C3STOP,
 };
+EXPORT_SYMBOL(decrementer_clockevent);

-struct decrementer_clock {
-       struct clock_event_device event;
-       u64 next_tb;
-};
-
-static DEFINE_PER_CPU(struct decrementer_clock, decrementers);
-
-#ifdef CONFIG_PPC_ISERIES
-static unsigned long __initdata iSeries_recal_titan;
-static signed long __initdata iSeries_recal_tb;
-
-/* Forward declaration is only needed for iSereis compiles */
-void __init clocksource_init(void);
-#endif
+DEFINE_PER_CPU(u64, decrementers_next_tb);
+static DEFINE_PER_CPU(struct clock_event_device, decrementers);

 #define XSEC_PER_SEC (1024*1024)

@@ -146,46 +128,32 @@ unsigned long tb_ticks_per_usec = 100; /* sane default */
 EXPORT_SYMBOL(tb_ticks_per_usec);
 unsigned long tb_ticks_per_sec;
 EXPORT_SYMBOL(tb_ticks_per_sec);       /* for cputime_t conversions */
-u64 tb_to_xs;
-unsigned tb_to_us;
-
-#define TICKLEN_SCALE  TICK_LENGTH_SHIFT
-u64 last_tick_len;     /* units are ns / 2^TICKLEN_SCALE */
-u64 ticklen_to_xs;     /* 0.64 fraction */
-
-/* If last_tick_len corresponds to about 1/HZ seconds, then
-   last_tick_len << TICKLEN_SHIFT will be about 2^63. */
-#define TICKLEN_SHIFT  (63 - 30 - TICKLEN_SCALE + SHIFT_HZ)

 DEFINE_SPINLOCK(rtc_lock);
 EXPORT_SYMBOL_GPL(rtc_lock);

 static u64 tb_to_ns_scale __read_mostly;
 static unsigned tb_to_ns_shift __read_mostly;
-static unsigned long boot_tb __read_mostly;
-
-struct gettimeofday_struct do_gtod;
+static u64 boot_tb __read_mostly;

 extern struct timezone sys_tz;
 static long timezone_offset;

 unsigned long ppc_proc_freq;
-EXPORT_SYMBOL(ppc_proc_freq);
+EXPORT_SYMBOL_GPL(ppc_proc_freq);
 unsigned long ppc_tb_freq;
+EXPORT_SYMBOL_GPL(ppc_tb_freq);

-static u64 tb_last_jiffy __cacheline_aligned_in_smp;
-static DEFINE_PER_CPU(u64, last_jiffy);
-
-#ifdef CONFIG_VIRT_CPU_ACCOUNTING
+#ifdef CONFIG_VIRT_CPU_ACCOUNTING_NATIVE
 /*
  * Factors for converting from cputime_t (timebase ticks) to
- * jiffies, milliseconds, seconds, and clock_t (1/USER_HZ seconds).
+ * jiffies, microseconds, seconds, and clock_t (1/USER_HZ seconds).
  * These are all stored as 0.64 fixed-point binary fractions.
  */
 u64 __cputime_jiffies_factor;
 EXPORT_SYMBOL(__cputime_jiffies_factor);
-u64 __cputime_msec_factor;
-EXPORT_SYMBOL(__cputime_msec_factor);
+u64 __cputime_usec_factor;
+EXPORT_SYMBOL(__cputime_usec_factor);
 u64 __cputime_sec_factor;
 EXPORT_SYMBOL(__cputime_sec_factor);
 u64 __cputime_clockt_factor;
@@ -193,14 +161,18 @@ EXPORT_SYMBOL(__cputime_clockt_factor);
 DEFINE_PER_CPU(unsigned long, cputime_last_delta);
 DEFINE_PER_CPU(unsigned long, cputime_scaled_last_delta);

+cputime_t cputime_one_jiffy;
+
+void (*dtl_consumer)(struct dtl_entry *, u64);
+
 static void calc_cputime_factors(void)
 {
        struct div_result res;

        div128_by_32(HZ, 0, tb_ticks_per_sec, &res);
        __cputime_jiffies_factor = res.result_low;
-       div128_by_32(1000, 0, tb_ticks_per_sec, &res);
-       __cputime_msec_factor = res.result_low;
+       div128_by_32(1000000, 0, tb_ticks_per_sec, &res);
+       __cputime_usec_factor = res.result_low;
        div128_by_32(1, 0, tb_ticks_per_sec, &res);
        __cputime_sec_factor = res.result_low;
        div128_by_32(USER_HZ, 0, tb_ticks_per_sec, &res);
@@ -208,185 +180,209 @@ static void calc_cputime_factors(void)
 }

 /*
- * Read the PURR on systems that have it, otherwise the timebase.
+ * Read the SPURR on systems that have it, otherwise the PURR,
+ * or if that doesn't exist return the timebase value passed in.
  */
-static u64 read_purr(void)
+static u64 read_spurr(u64 tb)
 {
+       if (cpu_has_feature(CPU_FTR_SPURR))
+               return mfspr(SPRN_SPURR);
        if (cpu_has_feature(CPU_FTR_PURR))
                return mfspr(SPRN_PURR);
-       return mftb();
+       return tb;
 }

-/*
- * Read the SPURR on systems that have it, otherwise the purr
- */
-static u64 read_spurr(u64 purr)
-{
-       /*
-        * cpus without PURR won't have a SPURR
-        * We already know the former when we use this, so tell gcc
-        */
-       if (cpu_has_feature(CPU_FTR_PURR) && cpu_has_feature(CPU_FTR_SPURR))
-               return mfspr(SPRN_SPURR);
-       return purr;
-}
+#ifdef CONFIG_PPC_SPLPAR

 /*
- * Account time for a transition between system, hard irq
- * or soft irq state.
+ * Scan the dispatch trace log and count up the stolen time.
+ * Should be called with interrupts disabled.
  */
-void account_system_vtime(struct task_struct *tsk)
+static u64 scan_dispatch_log(u64 stop_tb)
 {
-       u64 now, nowscaled, delta, deltascaled, sys_time;
-       unsigned long flags;
+       u64 i = local_paca->dtl_ridx;
+       struct dtl_entry *dtl = local_paca->dtl_curr;
+       struct dtl_entry *dtl_end = local_paca->dispatch_log_end;
+       struct lppaca *vpa = local_paca->lppaca_ptr;
+       u64 tb_delta;
+       u64 stolen = 0;
+       u64 dtb;
+
+       if (!dtl)
+               return 0;

-       local_irq_save(flags);
-       now = read_purr();
-       nowscaled = read_spurr(now);
-       delta = now - get_paca()->startpurr;
-       deltascaled = nowscaled - get_paca()->startspurr;
-       get_paca()->startpurr = now;
-       get_paca()->startspurr = nowscaled;
-       if (!in_interrupt()) {
-               /* deltascaled includes both user and system time.
-                * Hence scale it based on the purr ratio to estimate
-                * the system time */
-               sys_time = get_paca()->system_time;
-               if (get_paca()->user_time)
-                       deltascaled = deltascaled * sys_time /
-                               (sys_time + get_paca()->user_time);
-               delta += sys_time;
-               get_paca()->system_time = 0;
+       if (i == be64_to_cpu(vpa->dtl_idx))
+               return 0;
+       while (i < be64_to_cpu(vpa->dtl_idx)) {
+               dtb = be64_to_cpu(dtl->timebase);
+               tb_delta = be32_to_cpu(dtl->enqueue_to_dispatch_time) +
+                       be32_to_cpu(dtl->ready_to_enqueue_time);
+               barrier();
+               if (i + N_DISPATCH_LOG < be64_to_cpu(vpa->dtl_idx)) {
+                       /* buffer has overflowed */
+                       i = be64_to_cpu(vpa->dtl_idx) - N_DISPATCH_LOG;
+                       dtl = local_paca->dispatch_log + (i % N_DISPATCH_LOG);
+                       continue;
+               }
+               if (dtb > stop_tb)
+                       break;
+               if (dtl_consumer)
+                       dtl_consumer(dtl, i);
+               stolen += tb_delta;
+               ++i;
+               ++dtl;
+               if (dtl == dtl_end)
+                       dtl = local_paca->dispatch_log;
        }
-       account_system_time(tsk, 0, delta);
-       account_system_time_scaled(tsk, deltascaled);
-       per_cpu(cputime_last_delta, smp_processor_id()) = delta;
-       per_cpu(cputime_scaled_last_delta, smp_processor_id()) = deltascaled;
-       local_irq_restore(flags);
+       local_paca->dtl_ridx = i;
+       local_paca->dtl_curr = dtl;
+       return stolen;
 }

 /*
- * Transfer the user and system times accumulated in the paca
- * by the exception entry and exit code to the generic process
- * user and system time records.
- * Must be called with interrupts disabled.
+ * Accumulate stolen time by scanning the dispatch trace log.
+ * Called on entry from user mode.
  */
-void account_process_tick(struct task_struct *tsk, int user_tick)
+void accumulate_stolen_time(void)
 {
-       cputime_t utime, utimescaled;
+       u64 sst, ust;

-       utime = get_paca()->user_time;
-       get_paca()->user_time = 0;
-       account_user_time(tsk, utime);
+       u8 save_soft_enabled = local_paca->soft_enabled;

-       utimescaled = cputime_to_scaled(utime);
-       account_user_time_scaled(tsk, utimescaled);
-}
+       /* We are called early in the exception entry, before
+        * soft/hard_enabled are sync'ed to the expected state
+        * for the exception. We are hard disabled but the PACA
+        * needs to reflect that so various debug stuff doesn't
+        * complain
+        */
+       local_paca->soft_enabled = 0;

-/*
- * Stuff for accounting stolen time.
- */
-struct cpu_purr_data {
-       int     initialized;            /* thread is running */
-       u64     tb;                     /* last TB value read */
-       u64     purr;                   /* last PURR value read */
-       u64     spurr;                  /* last SPURR value read */
-};
+       sst = scan_dispatch_log(local_paca->starttime_user);
+       ust = scan_dispatch_log(local_paca->starttime);
+       local_paca->system_time -= sst;
+       local_paca->user_time -= ust;
+       local_paca->stolen_time += ust + sst;

-/*
- * Each entry in the cpu_purr_data array is manipulated only by its
- * "owner" cpu -- usually in the timer interrupt but also occasionally
- * in process context for cpu online.  As long as cpus do not touch
- * each others' cpu_purr_data, disabling local interrupts is
- * sufficient to serialize accesses.
- */
-static DEFINE_PER_CPU(struct cpu_purr_data, cpu_purr_data);
+       local_paca->soft_enabled = save_soft_enabled;
+}

-static void snapshot_tb_and_purr(void *data)
+static inline u64 calculate_stolen_time(u64 stop_tb)
 {
-       unsigned long flags;
-       struct cpu_purr_data *p = &__get_cpu_var(cpu_purr_data);
-
-       local_irq_save(flags);
-       p->tb = get_tb_or_rtc();
-       p->purr = mfspr(SPRN_PURR);
-       wmb();
-       p->initialized = 1;
-       local_irq_restore(flags);
+       u64 stolen = 0;
+
+       if (get_paca()->dtl_ridx != be64_to_cpu(get_lppaca()->dtl_idx)) {
+               stolen = scan_dispatch_log(stop_tb);
+               get_paca()->system_time -= stolen;
+       }
+
+       stolen += get_paca()->stolen_time;
+       get_paca()->stolen_time = 0;
+       return stolen;
 }

-/*
- * Called during boot when all cpus have come up.
- */
-void snapshot_timebases(void)
+#else /* CONFIG_PPC_SPLPAR */
+static inline u64 calculate_stolen_time(u64 stop_tb)
 {
-       if (!cpu_has_feature(CPU_FTR_PURR))
-               return;
-       on_each_cpu(snapshot_tb_and_purr, NULL, 0, 1);
+       return 0;
 }

+#endif /* CONFIG_PPC_SPLPAR */
+
 /*
- * Must be called with interrupts disabled.
+ * Account time for a transition between system, hard irq
+ * or soft irq state.
  */
-void calculate_steal_time(void)
+static u64 vtime_delta(struct task_struct *tsk,
+                       u64 *sys_scaled, u64 *stolen)
 {
-       u64 tb, purr;
-       s64 stolen;
-       struct cpu_purr_data *pme;
-
-       pme = &__get_cpu_var(cpu_purr_data);
-       if (!pme->initialized)
-               return;         /* !CPU_FTR_PURR or early in early boot */
-       tb = mftb();
-       purr = mfspr(SPRN_PURR);
-       stolen = (tb - pme->tb) - (purr - pme->purr);
-       if (stolen > 0)
-               account_steal_time(current, stolen);
-       pme->tb = tb;
-       pme->purr = purr;
+       u64 now, nowscaled, deltascaled;
+       u64 udelta, delta, user_scaled;
+
+       WARN_ON_ONCE(!irqs_disabled());
+
+       now = mftb();
+       nowscaled = read_spurr(now);
+       get_paca()->system_time += now - get_paca()->starttime;
+       get_paca()->starttime = now;
+       deltascaled = nowscaled - get_paca()->startspurr;
+       get_paca()->startspurr = nowscaled;
+
+       *stolen = calculate_stolen_time(now);
+
+       delta = get_paca()->system_time;
+       get_paca()->system_time = 0;
+       udelta = get_paca()->user_time - get_paca()->utime_sspurr;
+       get_paca()->utime_sspurr = get_paca()->user_time;
+
+       /*
+        * Because we don't read the SPURR on every kernel entry/exit,
+        * deltascaled includes both user and system SPURR ticks.
+        * Apportion these ticks to system SPURR ticks and user
+        * SPURR ticks in the same ratio as the system time (delta)
+        * and user time (udelta) values obtained from the timebase
+        * over the same interval. The system ticks get accounted here;
+        * the user ticks get saved up in paca->user_time_scaled to be
+        * used by account_process_tick.
+        */
+       *sys_scaled = delta;
+       user_scaled = udelta;
+       if (deltascaled != delta + udelta) {
+               if (udelta) {
+                       *sys_scaled = deltascaled * delta / (delta + udelta);
+                       user_scaled = deltascaled - *sys_scaled;
+               } else {
+                       *sys_scaled = deltascaled;
+               }
+       }
+       get_paca()->user_time_scaled += user_scaled;
+
+       return delta;
 }

-#ifdef CONFIG_PPC_SPLPAR
-/*
- * Must be called before the cpu is added to the online map when
- * a cpu is being brought up at runtime.
- */
-static void snapshot_purr(void)
+void vtime_account_system(struct task_struct *tsk)
 {
-       struct cpu_purr_data *pme;
-       unsigned long flags;
+       u64 delta, sys_scaled, stolen;

-       if (!cpu_has_feature(CPU_FTR_PURR))
-               return;
-       local_irq_save(flags);
-       pme = &__get_cpu_var(cpu_purr_data);
-       pme->tb = mftb();
-       pme->purr = mfspr(SPRN_PURR);
-       pme->initialized = 1;
-       local_irq_restore(flags);
+       delta = vtime_delta(tsk, &sys_scaled, &stolen);
+       account_system_time(tsk, 0, delta, sys_scaled);
+       if (stolen)
+               account_steal_time(stolen);
 }
+EXPORT_SYMBOL_GPL(vtime_account_system);

-#endif /* CONFIG_PPC_SPLPAR */
-
-#else /* ! CONFIG_VIRT_CPU_ACCOUNTING */
-#define calc_cputime_factors()
-#define calculate_steal_time() do { } while (0)
-#endif
+void vtime_account_idle(struct task_struct *tsk)
+{
+       u64 delta, sys_scaled, stolen;

-#if !(defined(CONFIG_VIRT_CPU_ACCOUNTING) && defined(CONFIG_PPC_SPLPAR))
-#define snapshot_purr()                do { } while (0)
-#endif
+       delta = vtime_delta(tsk, &sys_scaled, &stolen);
+       account_idle_time(delta + stolen);
+}

 /*
- * Called when a cpu comes up after the system has finished booting,
- * i.e. as a result of a hotplug cpu action.
+ * Transfer the user time accumulated in the paca
+ * by the exception entry and exit code to the generic
+ * process user time records.
+ * Must be called with interrupts disabled.
+ * Assumes that vtime_account_system/idle() has been called
+ * recently (i.e. since the last entry from usermode) so that
+ * get_paca()->user_time_scaled is up to date.
  */
-void snapshot_timebase(void)
+void vtime_account_user(struct task_struct *tsk)
 {
-       __get_cpu_var(last_jiffy) = get_tb_or_rtc();
-       snapshot_purr();
+       cputime_t utime, utimescaled;
+
+       utime = get_paca()->user_time;
+       utimescaled = get_paca()->user_time_scaled;
+       get_paca()->user_time = 0;
+       get_paca()->user_time_scaled = 0;
+       get_paca()->utime_sspurr = 0;
+       account_user_time(tsk, utime, utimescaled);
 }

+#else /* ! CONFIG_VIRT_CPU_ACCOUNTING_NATIVE */
+#define calc_cputime_factors()
+#endif
+
 void __delay(unsigned long loops)
 {
        unsigned long start;
@@ -415,51 +411,6 @@ void udelay(unsigned long usecs)
 }
 EXPORT_SYMBOL(udelay);

-
-/*
- * There are two copies of tb_to_xs and stamp_xsec so that no
- * lock is needed to access and use these values in
- * do_gettimeofday.  We alternate the copies and as long as a
- * reasonable time elapses between changes, there will never
- * be inconsistent values.  ntpd has a minimum of one minute
- * between updates.
- */
-static inline void update_gtod(u64 new_tb_stamp, u64 new_stamp_xsec,
-                              u64 new_tb_to_xs)
-{
-       unsigned temp_idx;
-       struct gettimeofday_vars *temp_varp;
-
-       temp_idx = (do_gtod.var_idx == 0);
-       temp_varp = &do_gtod.vars[temp_idx];
-
-       temp_varp->tb_to_xs = new_tb_to_xs;
-       temp_varp->tb_orig_stamp = new_tb_stamp;
-       temp_varp->stamp_xsec = new_stamp_xsec;
-       smp_mb();
-       do_gtod.varp = temp_varp;
-       do_gtod.var_idx = temp_idx;
-
-       /*
-        * tb_update_count is used to allow the userspace gettimeofday code
-        * to assure itself that it sees a consistent view of the tb_to_xs and
-        * stamp_xsec variables.  It reads the tb_update_count, then reads
-        * tb_to_xs and stamp_xsec and then reads tb_update_count again.  If
-        * the two values of tb_update_count match and are even then the
-        * tb_to_xs and stamp_xsec values are consistent.  If not, then it
-        * loops back and reads them again until this criteria is met.
-        * We expect the caller to have done the first increment of
-        * vdso_data->tb_update_count already.
-        */
-       vdso_data->tb_orig_stamp = new_tb_stamp;
-       vdso_data->stamp_xsec = new_stamp_xsec;
-       vdso_data->tb_to_xs = new_tb_to_xs;
-       vdso_data->wtom_clock_sec = wall_to_monotonic.tv_sec;
-       vdso_data->wtom_clock_nsec = wall_to_monotonic.tv_nsec;
-       smp_wmb();
-       ++(vdso_data->tb_update_count);
-}
-
 #ifdef CONFIG_SMP
 unsigned long profile_pc(struct pt_regs *regs)
 {
@@ -473,87 +424,101 @@ unsigned long profile_pc(struct pt_regs *regs)
 EXPORT_SYMBOL(profile_pc);
 #endif

-#ifdef CONFIG_PPC_ISERIES
+#ifdef CONFIG_IRQ_WORK

-/*
- * This function recalibrates the timebase based on the 49-bit time-of-day
- * value in the Titan chip.  The Titan is much more accurate than the value
- * returned by the service processor for the timebase frequency.
+/*
+ * 64-bit uses a byte in the PACA, 32-bit uses a per-cpu variable...
  */
-
-static int __init iSeries_tb_recal(void)
+#ifdef CONFIG_PPC64
+static inline unsigned long test_irq_work_pending(void)
 {
-       struct div_result divres;
-       unsigned long titan, tb;
+       unsigned long x;

-       /* Make sure we only run on iSeries */
-       if (!firmware_has_feature(FW_FEATURE_ISERIES))
-               return -ENODEV;
+       asm volatile("lbz %0,%1(13)"
+               : "=r" (x)
+               : "i" (offsetof(struct paca_struct, irq_work_pending)));
+       return x;
+}

-       tb = get_tb();
-       titan = HvCallXm_loadTod();
-       if ( iSeries_recal_titan ) {
-               unsigned long tb_ticks = tb - iSeries_recal_tb;
-               unsigned long titan_usec = (titan - iSeries_recal_titan) >> 12;
-               unsigned long new_tb_ticks_per_sec = (tb_ticks * USEC_PER_SEC)/titan_usec;
-               unsigned long new_tb_ticks_per_jiffy = (new_tb_ticks_per_sec+(HZ/2))/HZ;
-               long tick_diff = new_tb_ticks_per_jiffy - tb_ticks_per_jiffy;
-               char sign = '+';
-               /* make sure tb_ticks_per_sec and tb_ticks_per_jiffy are consistent */
-               new_tb_ticks_per_sec = new_tb_ticks_per_jiffy * HZ;
-
-               if ( tick_diff < 0 ) {
-                       tick_diff = -tick_diff;
-                       sign = '-';
-               }
-               if ( tick_diff ) {
-                       if ( tick_diff < tb_ticks_per_jiffy/25 ) {
-                               printk( "Titan recalibrate: new tb_ticks_per_jiffy = %lu (%c%ld)\n",
-                                               new_tb_ticks_per_jiffy, sign, tick_diff );
-                               tb_ticks_per_jiffy = new_tb_ticks_per_jiffy;
-                               tb_ticks_per_sec = new_tb_ticks_per_sec;
-                               calc_cputime_factors();
-                               div128_by_32( XSEC_PER_SEC, 0, tb_ticks_per_sec, &divres );
-                               do_gtod.tb_ticks_per_sec = tb_ticks_per_sec;
-                               tb_to_xs = divres.result_low;
-                               do_gtod.varp->tb_to_xs = tb_to_xs;
-                               vdso_data->tb_ticks_per_sec = tb_ticks_per_sec;
-                               vdso_data->tb_to_xs = tb_to_xs;
-                       }
-                       else {
-                               printk( "Titan recalibrate: FAILED (difference > 4 percent)\n"
-                                       "       new tb_ticks_per_jiffy = %lu\n"
-                                       "       old tb_ticks_per_jiffy = %lu\n",
-                                       new_tb_ticks_per_jiffy, tb_ticks_per_jiffy );
-                       }
-               }
-       }
-       iSeries_recal_titan = titan;
-       iSeries_recal_tb = tb;
+static inline void set_irq_work_pending_flag(void)
+{
+       asm volatile("stb %0,%1(13)" : :
+               "r" (1),
+               "i" (offsetof(struct paca_struct, irq_work_pending)));
+}

-       /* Called here as now we know accurate values for the timebase */
-       clocksource_init();
-       return 0;
+static inline void clear_irq_work_pending(void)
+{
+       asm volatile("stb %0,%1(13)" : :
+               "r" (0),
+               "i" (offsetof(struct paca_struct, irq_work_pending)));
 }
-late_initcall(iSeries_tb_recal);
-/* Called from platform early init */
-void __init iSeries_time_init_early(void)
+#else /* 32-bit */
+
+DEFINE_PER_CPU(u8, irq_work_pending);
+
+#define set_irq_work_pending_flag()    __get_cpu_var(irq_work_pending) = 1
+#define test_irq_work_pending()                __get_cpu_var(irq_work_pending)
+#define clear_irq_work_pending()       __get_cpu_var(irq_work_pending) = 0
+
+#endif /* 32 vs 64 bit */
+
+void arch_irq_work_raise(void)
 {
-       iSeries_recal_tb = get_tb();
-       iSeries_recal_titan = HvCallXm_loadTod();
+       preempt_disable();
+       set_irq_work_pending_flag();
+       set_dec(1);
+       preempt_enable();
 }
-#endif /* CONFIG_PPC_ISERIES */

-/*
- * For iSeries shared processors, we have to let the hypervisor
- * set the hardware decrementer.  We set a virtual decrementer
- * in the lppaca and call the hypervisor if the virtual
- * decrementer is less than the current value in the hardware
- * decrementer. (almost always the new decrementer value will
- * be greater than the current hardware decementer so the hypervisor
- * call will not be needed)
- */
+#else  /* CONFIG_IRQ_WORK */
+
+#define test_irq_work_pending()        0
+#define clear_irq_work_pending()
+
+#endif /* CONFIG_IRQ_WORK */
+
+void __timer_interrupt(void)
+{
+       struct pt_regs *regs = get_irq_regs();
+       u64 *next_tb = &__get_cpu_var(decrementers_next_tb);
+       struct clock_event_device *evt = &__get_cpu_var(decrementers);
+       u64 now;
+
+       trace_timer_interrupt_entry(regs);
+
+       if (test_irq_work_pending()) {
+               clear_irq_work_pending();
+               irq_work_run();
+       }
+
+       now = get_tb_or_rtc();
+       if (now >= *next_tb) {
+               *next_tb = ~(u64)0;
+               if (evt->event_handler)
+                       evt->event_handler(evt);
+               __get_cpu_var(irq_stat).timer_irqs_event++;
+       } else {
+               now = *next_tb - now;
+               if (now <= DECREMENTER_MAX)
+                       set_dec((int)now);
+               /* We may have raced with new irq work */
+               if (test_irq_work_pending())
+                       set_dec(1);
+               __get_cpu_var(irq_stat).timer_irqs_others++;
+       }
+
+#ifdef CONFIG_PPC64
+       /* collect purr register values often, for accurate calculations */
+       if (firmware_has_feature(FW_FEATURE_SPLPAR)) {
+               struct cpu_usage *cu = &__get_cpu_var(cpu_usage_array);
+               cu->current_tb = mfspr(SPRN_PURR);
+       }
+#endif
+
+       trace_timer_interrupt_exit(regs);
+}

 /*
  * timer_interrupt - gets called when the decrementer overflows,
@@ -562,93 +527,67 @@ void __init iSeries_time_init_early(void)
 void timer_interrupt(struct pt_regs * regs)
 {
        struct pt_regs *old_regs;
-       struct decrementer_clock *decrementer = &__get_cpu_var(decrementers);
-       struct clock_event_device *evt = &decrementer->event;
-       u64 now;
+       u64 *next_tb = &__get_cpu_var(decrementers_next_tb);

        /* Ensure a positive value is written to the decrementer, or else
-        * some CPUs will continuue to take decrementer exceptions */
+        * some CPUs will continue to take decrementer exceptions.
+        */
        set_dec(DECREMENTER_MAX);

-#ifdef CONFIG_PPC32
-       if (atomic_read(&ppc_n_lost_interrupts) != 0)
-               do_IRQ(regs);
-#endif
-
-       now = get_tb_or_rtc();
-       if (now < decrementer->next_tb) {
-               /* not time for this event yet */
-               now = decrementer->next_tb - now;
-               if (now <= DECREMENTER_MAX)
-                       set_dec((int)now);
+       /* Some implementations of hotplug will get timer interrupts while
+        * offline, just ignore these and we also need to set
+        * decrementers_next_tb as MAX to make sure __check_irq_replay
+        * don't replay timer interrupt when return, otherwise we'll trap
+        * here infinitely :(
+        */
+       if (!cpu_online(smp_processor_id())) {
+               *next_tb = ~(u64)0;
                return;
        }
-       old_regs = set_irq_regs(regs);
-       irq_enter();
-
-       calculate_steal_time();
-#ifdef CONFIG_PPC_ISERIES
-       if (firmware_has_feature(FW_FEATURE_ISERIES))
-               get_lppaca()->int_dword.fields.decr_int = 0;
-#endif
+       /* Conditionally hard-enable interrupts now that the DEC has been
+        * bumped to its maximum value
+        */
+       may_hard_irq_enable();

-       if (evt->event_handler)
-               evt->event_handler(evt);
-#ifdef CONFIG_PPC_ISERIES
-       if (firmware_has_feature(FW_FEATURE_ISERIES) && hvlpevent_is_pending())
-               process_hvlpevents();
+#if defined(CONFIG_PPC32) && defined(CONFIG_PPC_PMAC)
+       if (atomic_read(&ppc_n_lost_interrupts) != 0)
+               do_IRQ(regs);
 #endif

-#ifdef CONFIG_PPC64
-       /* collect purr register values often, for accurate calculations */
-       if (firmware_has_feature(FW_FEATURE_SPLPAR)) {
-               struct cpu_usage *cu = &__get_cpu_var(cpu_usage_array);
-               cu->current_tb = mfspr(SPRN_PURR);
-       }
-#endif
+       old_regs = set_irq_regs(regs);
+       irq_enter();

+       __timer_interrupt();
        irq_exit();
        set_irq_regs(old_regs);
 }

-void wakeup_decrementer(void)
+/*
+ * Hypervisor decrementer interrupts shouldn't occur but are sometimes
+ * left pending on exit from a KVM guest.  We don't need to do anything
+ * to clear them, as they are edge-triggered.
+ */
+void hdec_interrupt(struct pt_regs *regs)
 {
-       unsigned long ticks;
-
-       /*
-        * The timebase gets saved on sleep and restored on wakeup,
-        * so all we need to do is to reset the decrementer.
-        */
-       ticks = tb_ticks_since(__get_cpu_var(last_jiffy));
-       if (ticks < tb_ticks_per_jiffy)
-               ticks = tb_ticks_per_jiffy - ticks;
-       else
-               ticks = 1;
-       set_dec(ticks);
 }

 #ifdef CONFIG_SUSPEND
-void generic_suspend_disable_irqs(void)
+static void generic_suspend_disable_irqs(void)
 {
-       preempt_disable();
-
        /* Disable the decrementer, so that it doesn't interfere
         * with suspending.
         */

-       set_dec(0x7fffffff);
+       set_dec(DECREMENTER_MAX);
        local_irq_disable();
-       set_dec(0x7fffffff);
+       set_dec(DECREMENTER_MAX);
 }

-void generic_suspend_enable_irqs(void)
+static void generic_suspend_enable_irqs(void)
 {
-       wakeup_decrementer();
-
        local_irq_enable();
-       preempt_enable();
 }

 /* Overrides the weak version in kernel/power/main.c */
@@ -668,23 +607,6 @@ void arch_suspend_enable_irqs(void)
 }
 #endif

-#ifdef CONFIG_SMP
-void __init smp_space_timers(unsigned int max_cpus)
-{
-       int i;
-       u64 previous_tb = per_cpu(last_jiffy, boot_cpuid);
-
-       /* make sure tb > per_cpu(last_jiffy, cpu) for all cpus always */
-       previous_tb -= tb_ticks_per_jiffy;
-
-       for_each_possible_cpu(i) {
-               if (i == boot_cpuid)
-                       continue;
-               per_cpu(last_jiffy, i) = previous_tb;
-       }
-}
-#endif
-
 /*
  * Scheduler clock - returns current time in nanosec units.
  *
@@ -702,7 +624,7 @@ unsigned long long sched_clock(void)
 static int __init get_freq(char *name, int cells, unsigned long *val)
 {
        struct device_node *cpu;
-       const unsigned int *fp;
+       const __be32 *fp;
        int found = 0;

        /* The cpu node should have timebase and clock frequency properties */
@@ -721,6 +643,17 @@ static int __init get_freq(char *name, int cells, unsigned long *val)
        return found;
 }

+void start_cpu_decrementer(void)
+{
+#if defined(CONFIG_BOOKE) || defined(CONFIG_40x)
+       /* Clear any pending timer interrupts */
+       mtspr(SPRN_TSR, TSR_ENW | TSR_WIS | TSR_DIS | TSR_FIS);
+
+       /* Enable decrementer interrupt */
+       mtspr(SPRN_TCR, TCR_DIE);
+#endif /* defined(CONFIG_BOOKE) || defined(CONFIG_40x) */
+}
+
 void __init generic_calibrate_decr(void)
 {
        ppc_tb_freq = DEFAULT_TB_FREQ;          /* hardcoded default */
@@ -740,18 +673,6 @@ void __init generic_calibrate_decr(void)
                printk(KERN_ERR "WARNING: Estimating processor frequency "
                                "(not found)\n");
        }
-
-#if defined(CONFIG_BOOKE) || defined(CONFIG_40x)
-       /* Set the time base to zero */
-       mtspr(SPRN_TBWL, 0);
-       mtspr(SPRN_TBWU, 0);
-
-       /* Clear any pending timer interrupts */
-       mtspr(SPRN_TSR, TSR_ENW | TSR_WIS | TSR_DIS | TSR_FIS);
-
-       /* Enable decrementer interrupt */
-       mtspr(SPRN_TCR, TCR_DIE);
-#endif
 }

 int update_persistent_clock(struct timespec now)
@@ -759,7 +680,7 @@ int update_persistent_clock(struct timespec now)
        struct rtc_time tm;

        if (!ppc_md.set_rtc_time)
-               return 0;
+               return -ENODEV;

        to_tm(now.tv_sec + 1 + timezone_offset, &tm);
        tm.tm_year -= 1900;
@@ -768,11 +689,12 @@ int update_persistent_clock(struct timespec now)
        return ppc_md.set_rtc_time(&tm);
 }

-unsigned long read_persistent_clock(void)
+static void __read_persistent_clock(struct timespec *ts)
 {
        struct rtc_time tm;
        static int first = 1;

+       ts->tv_nsec = 0;
        /* XXX this is a litle fragile but will work okay in the short term */
        if (first) {
                first = 0;
@@ -780,30 +702,49 @@
                timezone_offset = ppc_md.time_init();

                /* get_boot_time() isn't guaranteed to be safe to call late */
-               if (ppc_md.get_boot_time)
-                       return ppc_md.get_boot_time() -timezone_offset;
+               if (ppc_md.get_boot_time) {
+                       ts->tv_sec = ppc_md.get_boot_time() - timezone_offset;
+                       return;
+               }
+       }
+       if (!ppc_md.get_rtc_time) {
+               ts->tv_sec = 0;
+               return;
        }
-       if (!ppc_md.get_rtc_time)
-               return 0;
        ppc_md.get_rtc_time(&tm);
-       return mktime(tm.tm_year+1900, tm.tm_mon+1, tm.tm_mday,
-                     tm.tm_hour, tm.tm_min, tm.tm_sec);
+
+       ts->tv_sec = mktime(tm.tm_year+1900, tm.tm_mon+1, tm.tm_mday,
+                           tm.tm_hour, tm.tm_min, tm.tm_sec);
+}
+
+void read_persistent_clock(struct timespec *ts)
+{
+       __read_persistent_clock(ts);
+
+       /* Sanitize it in case real time clock is set below EPOCH */
+       if (ts->tv_sec < 0) {
+               ts->tv_sec = 0;
+               ts->tv_nsec = 0;
+       }
+
 }

 /* clocksource code */
-static cycle_t rtc_read(void)
+static cycle_t rtc_read(struct clocksource *cs)
 {
        return (cycle_t)get_rtc();
 }

-static cycle_t timebase_read(void)
+static cycle_t timebase_read(struct clocksource *cs)
 {
        return (cycle_t)get_tb();
 }

-void update_vsyscall(struct timespec *wall_time, struct clocksource *clock)
+void update_vsyscall_old(struct timespec *wall_time, struct timespec *wtm,
+                        struct clocksource *clock, u32 mult)
 {
-       u64 t2x, stamp_xsec;
+       u64 new_tb_to_xs, new_stamp_xsec;
+       u32 frac_sec;

        if (clock != &clocksource_timebase)
                return;
@@ -812,27 +753,45 @@ void update_vsyscall(struct timespec *wall_time, struct clocksource *clock)
        ++vdso_data->tb_update_count;
        smp_mb();

-       /* XXX this assumes clock->shift == 22 */
-       /* 4611686018 ~= 2^(20+64-22) / 1e9 */
-       t2x = (u64) clock->mult * 4611686018ULL;
-       stamp_xsec = (u64) xtime.tv_nsec * XSEC_PER_SEC;
-       do_div(stamp_xsec, 1000000000);
-       stamp_xsec += (u64) xtime.tv_sec * XSEC_PER_SEC;
-       update_gtod(clock->cycle_last, stamp_xsec, t2x);
+       /* 19342813113834067 ~= 2^(20+64) / 1e9 */
+       new_tb_to_xs = (u64) mult * (19342813113834067ULL >> clock->shift);
+       new_stamp_xsec = (u64) wall_time->tv_nsec * XSEC_PER_SEC;
+       do_div(new_stamp_xsec, 1000000000);
+       new_stamp_xsec += (u64) wall_time->tv_sec * XSEC_PER_SEC;
+
+       BUG_ON(wall_time->tv_nsec >= NSEC_PER_SEC);
+       /* this is tv_nsec / 1e9 as a 0.32 fraction */
+       frac_sec = ((u64) wall_time->tv_nsec * 18446744073ULL) >> 32;
+
+       /*
+        * tb_update_count is used to allow the userspace gettimeofday code
+        * to assure itself that it sees a consistent view of the tb_to_xs and
+        * stamp_xsec variables.  It reads the tb_update_count, then reads
+        * tb_to_xs and stamp_xsec and then reads tb_update_count again.  If
+        * the two values of tb_update_count match and are even then the
+        * tb_to_xs and stamp_xsec values are consistent.  If not, then it
+        * loops back and reads them again until this criteria is met.
+        * We expect the caller to have done the first increment of
+        * vdso_data->tb_update_count already.
+        */
+       vdso_data->tb_orig_stamp = clock->cycle_last;
+       vdso_data->stamp_xsec = new_stamp_xsec;
+       vdso_data->tb_to_xs = new_tb_to_xs;
+       vdso_data->wtom_clock_sec = wtm->tv_sec;
+       vdso_data->wtom_clock_nsec = wtm->tv_nsec;
+       vdso_data->stamp_xtime = *wall_time;
+       vdso_data->stamp_sec_fraction = frac_sec;
+       smp_wmb();
+       ++(vdso_data->tb_update_count);
 }

 void update_vsyscall_tz(void)
 {
-       /* Make userspace gettimeofday spin until we're done. */
-       ++vdso_data->tb_update_count;
-       smp_mb();
        vdso_data->tz_minuteswest = sys_tz.tz_minuteswest;
        vdso_data->tz_dsttime = sys_tz.tz_dsttime;
-       smp_mb();
-       ++vdso_data->tb_update_count;
 }

-void __init clocksource_init(void)
+static void __init clocksource_init(void)
 {
        struct clocksource *clock;

@@ -841,9 +800,7 @@
        else
                clock = &clocksource_timebase;

-       clock->mult = clocksource_hz2mult(tb_ticks_per_sec, clock->shift);
-
-       if (clocksource_register(clock)) {
+       if (clocksource_register_hz(clock, tb_ticks_per_sec)) {
                printk(KERN_ERR "clocksource: %s is already registered\n",
                       clock->name);
                return;
@@ -856,8 +813,13 @@ static int decrementer_set_next_event(unsigned long evt,
                                      struct clock_event_device *dev)
 {
-       __get_cpu_var(decrementers).next_tb = get_tb_or_rtc() + evt;
+       __get_cpu_var(decrementers_next_tb) = get_tb_or_rtc() + evt;
        set_dec(evt);
+
+       /* We may have raced with new irq work */
+       if (test_irq_work_pending())
+               set_dec(1);
+
        return 0;
 }

@@ -868,15 +830,24 @@ static void decrementer_set_mode(enum clock_event_mode mode,
                decrementer_set_next_event(DECREMENTER_MAX, dev);
 }

+/* Interrupt handler for the timer broadcast IPI */
+void tick_broadcast_ipi_handler(void)
+{
+       u64 *next_tb = &__get_cpu_var(decrementers_next_tb);
+
+       *next_tb = get_tb_or_rtc();
+       __timer_interrupt();
+}
+
 static void register_decrementer_clockevent(int cpu)
 {
-       struct clock_event_device *dec = &per_cpu(decrementers, cpu).event;
+       struct clock_event_device *dec = &per_cpu(decrementers, cpu);

        *dec = decrementer_clockevent;
-       dec->cpumask = cpumask_of_cpu(cpu);
+       dec->cpumask = cpumask_of(cpu);

-       printk(KERN_DEBUG "clockevent: %s mult[%lx] shift[%d] cpu[%d]\n",
-              dec->name, dec->mult, dec->shift, cpu);
+       printk_once(KERN_DEBUG "clockevent: %s mult[%x] shift[%d] cpu[%d]\n",
+                   dec->name, dec->mult, dec->shift, cpu);

        clockevents_register_device(dec);
 }
@@ -885,8 +856,8 @@ static void __init init_decrementer_clockevent(void)
 {
        int cpu = smp_processor_id();

-       decrementer_clockevent.mult = div_sc(ppc_tb_freq, NSEC_PER_SEC,
-                                            decrementer_clockevent.shift);
+       clockevents_calc_mult_shift(&decrementer_clockevent, ppc_tb_freq, 4);
+
        decrementer_clockevent.max_delta_ns =
                clockevent_delta2ns(DECREMENTER_MAX, &decrementer_clockevent);
        decrementer_clockevent.min_delta_ns =
@@ -897,6 +868,11 @@
 void secondary_cpu_time_init(void)
 {
+       /* Start the decrementer on CPUs that have manual control
+        * such as BookE
+        */
+       start_cpu_decrementer();
+
        /* FIME: Should make unrelatred change to move snapshot_timebase
         * call here ! */
        register_decrementer_clockevent(smp_processor_id());
@@ -905,15 +881,13 @@
 /* This function is only called on the boot processor */
 void __init time_init(void)
 {
-       unsigned long flags;
        struct div_result res;
-       u64 scale, x;
+       u64 scale;
        unsigned shift;

        if (__USE_RTC()) {
                /* 601 processor: dec counts down by 128 every 128ns */
                ppc_tb_freq = 1000000000;
-               tb_last_jiffy = get_rtcl();
        } else {
                /* Normal PowerPC with timebase register */
                ppc_md.calibrate_decr();
@@ -921,47 +895,13 @@ void __init time_init(void)
                       ppc_tb_freq / 1000000, ppc_tb_freq % 1000000);
                printk(KERN_DEBUG "time_init: processor frequency = %lu.%.6lu MHz\n",
                       ppc_proc_freq / 1000000, ppc_proc_freq % 1000000);
-               tb_last_jiffy = get_tb();
        }

        tb_ticks_per_jiffy = ppc_tb_freq / HZ;
        tb_ticks_per_sec = ppc_tb_freq;
        tb_ticks_per_usec = ppc_tb_freq / 1000000;
-       tb_to_us = mulhwu_scale_factor(ppc_tb_freq, 1000000);
        calc_cputime_factors();
-
-       /*
-        * Calculate the length of each tick in ns.  It will not be
-        * exactly 1e9/HZ unless ppc_tb_freq is divisible by HZ.
-        * We compute 1e9 * tb_ticks_per_jiffy / ppc_tb_freq,
-        * rounded up.
-        */
-       x = (u64) NSEC_PER_SEC * tb_ticks_per_jiffy + ppc_tb_freq - 1;
-       do_div(x, ppc_tb_freq);
-       tick_nsec = x;
-       last_tick_len = x << TICKLEN_SCALE;
-
-       /*
-        * Compute ticklen_to_xs, which is a factor which gets multiplied
-        * by (last_tick_len << TICKLEN_SHIFT) to get a tb_to_xs value.
-        * It is computed as:
-        *   ticklen_to_xs = 2^N / (tb_ticks_per_jiffy * 1e9)
-        * where N = 64 + 20 - TICKLEN_SCALE - TICKLEN_SHIFT
-        * which turns out to be N = 51 - SHIFT_HZ.
-        * This gives the result as a 0.64 fixed-point fraction.
-        * That value is reduced by an offset amounting to 1 xsec per
-        * 2^31 timebase ticks to avoid problems with time going backwards
-        * by 1 xsec when we do timer_recalc_offset due to losing the
-        * fractional xsec.  That offset is equal to ppc_tb_freq/2^51
-        * since there are 2^20 xsec in a second.
-        */
-       div128_by_32((1ULL << 51) - ppc_tb_freq, 0,
-                    tb_ticks_per_jiffy << SHIFT_HZ, &res);
-       div128_by_32(res.result_high, res.result_low, NSEC_PER_SEC, &res);
-       ticklen_to_xs = res.result_low;
-
-       /* Compute tb_to_xs from tick_nsec */
-       tb_to_xs = mulhdu(last_tick_len << TICKLEN_SHIFT, ticklen_to_xs);
+       setup_cputime_one_jiffy();

        /*
         * Compute scale factor for sched_clock.
@@ -984,38 +924,25 @@ void __init time_init(void)
        /* Save the current timebase to pretty up CONFIG_PRINTK_TIME */
        boot_tb = get_tb_or_rtc();

-       write_seqlock_irqsave(&xtime_lock, flags);
-
        /* If platform provided a timezone (pmac), we correct the time */
-       if (timezone_offset) {
+       if (timezone_offset) {
                sys_tz.tz_minuteswest = -timezone_offset / 60;
                sys_tz.tz_dsttime = 0;
-       }
-
-       do_gtod.varp = &do_gtod.vars[0];
-       do_gtod.var_idx = 0;
-       do_gtod.varp->tb_orig_stamp = tb_last_jiffy;
-       __get_cpu_var(last_jiffy) = tb_last_jiffy;
-       do_gtod.varp->stamp_xsec = (u64) xtime.tv_sec * XSEC_PER_SEC;
-       do_gtod.tb_ticks_per_sec = tb_ticks_per_sec;
-       do_gtod.varp->tb_to_xs = tb_to_xs;
-       do_gtod.tb_to_us = tb_to_us;
-
-       vdso_data->tb_orig_stamp = tb_last_jiffy;
+       }
+
        vdso_data->tb_update_count = 0;
        vdso_data->tb_ticks_per_sec = tb_ticks_per_sec;
-       vdso_data->stamp_xsec = (u64) xtime.tv_sec * XSEC_PER_SEC;
-       vdso_data->tb_to_xs = tb_to_xs;
-
-       time_freq = 0;
-
-       write_sequnlock_irqrestore(&xtime_lock, flags);

-       /* Register the clocksource, if we're not running on iSeries */
-       if (!firmware_has_feature(FW_FEATURE_ISERIES))
-               clocksource_init();
+       /* Start the decrementer on CPUs that have manual control
+        * such as BookE
+        */
+       start_cpu_decrementer();
+
+       /* Register the clocksource */
+       clocksource_init();

        init_decrementer_clockevent();
+       tick_setup_hrtimer_broadcast();
 }

@@ -1098,39 +1025,6 @@ void to_tm(int tim, struct rtc_time * tm)
        GregorianDay(tm);
 }

-/* Auxiliary function to compute scaling factors */
-/* Actually the choice of a timebase running at 1/4 the of the bus
- * frequency giving resolution of a few tens of nanoseconds is quite nice.
- * It makes this computation very precise (27-28 bits typically) which
- * is optimistic considering the stability of most processor clock
- * oscillators and the precision with which the timebase frequency
- * is measured but does not harm.
- */
-unsigned mulhwu_scale_factor(unsigned inscale, unsigned outscale)
-{
-       unsigned mlt=0, tmp, err;
-       /* No concern for performance, it's done once: use a stupid
-        * but safe and compact method to find the multiplier.
-        */
-
-       for (tmp = 1U<<31; tmp != 0; tmp >>= 1) {
-               if (mulhwu(inscale, mlt|tmp) < outscale)
-                       mlt |= tmp;
-       }
-
-       /* We might still be off by 1 for the best approximation.
-        * A side effect of this is that if outscale is too large
-        * the returned value will be zero.
-        * Many corner cases have been checked and seem to work,
-        * some might have been forgotten in the test however.
-        */
-
-       err = inscale * (mlt+1);
-       if (err <= inscale/2)
-               mlt++;
-       return mlt;
-}
-
 /*
  * Divide a 128-bit dividend by a 32-bit divisor, leaving a 128 bit
  * result.
@@ -1163,3 +1057,26 @@ void div128_by_32(u64 dividend_high, u64 dividend_low,
        dr->result_low = ((u64)y << 32) + z;

 }
+
+/* We don't need to calibrate delay, we use the CPU timebase for that */
+void calibrate_delay(void)
+{
+       /* Some generic code (such as spinlock debug) use loops_per_jiffy
+        * as the number of __delay(1) in a jiffy, so make it so
+        */
+       loops_per_jiffy = tb_ticks_per_jiffy;
+}
+
+static int __init rtc_init(void)
+{
+       struct platform_device *pdev;
+
+       if (!ppc_md.get_rtc_time)
+               return -ENODEV;
+
+       pdev = platform_device_register_simple("rtc-generic", -1, NULL, 0);
+
+       return PTR_ERR_OR_ZERO(pdev);
+}
+
+module_init(rtc_init);
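The conversion arithmetic this diff relies on is easier to see in isolation, so a few standalone C sketches follow. They are illustrative only: the frequencies, helper names and values in them are invented for the demos, and GCC/Clang's unsigned __int128 stands in for the kernel's 128-bit helpers. First, the 0.64 fixed-point factors that calc_cputime_factors() builds with div128_by_32(): a factor is units_per_sec / tb_ticks_per_sec as a binary fraction, and applying it keeps the high 64 bits of a 64x64 multiply, as mulhdu() does.

#include <stdint.h>
#include <stdio.h>

/* (units_per_sec << 64) / tb_ticks_per_sec, like div128_by_32() */
static uint64_t frac_0_64(uint64_t units_per_sec, uint64_t tb_ticks_per_sec)
{
        return (uint64_t)((((unsigned __int128)units_per_sec) << 64) /
                          tb_ticks_per_sec);
}

/* high 64 bits of ticks * factor, like mulhdu() */
static uint64_t ticks_to_units(uint64_t ticks, uint64_t factor)
{
        return (uint64_t)(((unsigned __int128)ticks * factor) >> 64);
}

int main(void)
{
        uint64_t tb_freq = 512000000;   /* hypothetical 512 MHz timebase */
        uint64_t usec_factor = frac_0_64(1000000, tb_freq);

        /* 512 ticks at 512 MHz should be exactly 1 microsecond */
        printf("%llu usec\n",
               (unsigned long long)ticks_to_units(512, usec_factor));
        return 0;
}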
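A sketch of the overflow handling in scan_dispatch_log(): the hypervisor only ever advances a write index over a fixed-size ring of dispatch-trace entries, so a reader that has fallen more than N_DISPATCH_LOG entries behind has been lapped and must resynchronize to the oldest surviving entry. The kernel reads the entry fields first and only then re-checks the index (with barrier() in between) so a torn read of a recycled slot is discarded; the post-read check below plays that role. Names like N_LOG and struct entry are invented for the demo.

#include <stdint.h>
#include <stdio.h>

#define N_LOG 16                        /* stands in for N_DISPATCH_LOG */

struct entry { uint64_t timebase; uint32_t stolen; };

static uint64_t scan(struct entry *ring, uint64_t *read_idx,
                     uint64_t write_idx, uint64_t stop_tb)
{
        uint64_t i = *read_idx, total = 0;

        while (i < write_idx) {
                struct entry *e = &ring[i % N_LOG];
                uint64_t tb = e->timebase;
                uint32_t stolen = e->stolen;

                /* writer lapped us: skip to the oldest surviving entry */
                if (i + N_LOG < write_idx) {
                        i = write_idx - N_LOG;
                        continue;
                }
                if (tb > stop_tb)       /* entry is newer than our window */
                        break;
                total += stolen;
                ++i;
        }
        *read_idx = i;
        return total;
}

int main(void)
{
        struct entry ring[N_LOG];
        uint64_t rd = 0;

        for (int i = 0; i < 5; i++)
                ring[i] = (struct entry){ .timebase = (uint64_t)i, .stolen = 10 };
        printf("%llu\n", (unsigned long long)scan(ring, &rd, 5, 100)); /* 50 */
        return 0;
}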
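The apportionment step of vtime_delta() in isolation: because the SPURR is not read on every kernel entry and exit, its delta covers the whole interval since the last snapshot, so it is split between system and user in the same ratio as the timebase-derived system (delta) and user (udelta) intervals. The logic is copied from the diff; only the driver around it is invented.

#include <stdint.h>
#include <stdio.h>

static void apportion(uint64_t deltascaled, uint64_t delta, uint64_t udelta,
                      uint64_t *sys_scaled, uint64_t *user_scaled)
{
        *sys_scaled = delta;
        *user_scaled = udelta;
        if (deltascaled != delta + udelta) {
                if (udelta) {
                        *sys_scaled = deltascaled * delta / (delta + udelta);
                        *user_scaled = deltascaled - *sys_scaled;
                } else {
                        *sys_scaled = deltascaled;
                }
        }
}

int main(void)
{
        uint64_t sys, user;

        /* CPU ran at ~half speed: 300 scaled ticks cover 400+200 tb ticks */
        apportion(300, 400, 200, &sys, &user);
        printf("sys=%llu user=%llu\n",
               (unsigned long long)sys, (unsigned long long)user); /* 200 100 */
        return 0;
}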
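The reader side of the tb_update_count protocol described in the comment in update_vsyscall_old(), as a user-space-style sketch: the writer increments the count before and after publishing, so a reader retries whenever the count is odd or changes while it reads. The fence intrinsics stand in for the kernel's smp_mb()/smp_wmb() pairing, and the struct is a cut-down invented stand-in for the real vdso data page.

#include <stdint.h>
#include <stdio.h>

struct vdso_snapshot {
        volatile uint64_t tb_update_count;
        volatile uint64_t stamp_xsec;
        volatile uint64_t tb_to_xs;
};

static void read_consistent(struct vdso_snapshot *v,
                            uint64_t *stamp_xsec, uint64_t *tb_to_xs)
{
        uint64_t count;

        do {
                count = v->tb_update_count;
                __atomic_thread_fence(__ATOMIC_ACQUIRE);
                *stamp_xsec = v->stamp_xsec;
                *tb_to_xs = v->tb_to_xs;
                __atomic_thread_fence(__ATOMIC_ACQUIRE);
                /* odd count means a write is in progress; changed count
                 * means we raced with one: retry either way */
        } while ((count & 1) || count != v->tb_update_count);
}

int main(void)
{
        struct vdso_snapshot v;
        uint64_t stamp, scale;

        v.stamp_xsec = 123;
        v.tb_to_xs = 456;
        v.tb_update_count = 2;          /* even: stable snapshot */
        read_consistent(&v, &stamp, &scale);
        printf("%llu %llu\n", (unsigned long long)stamp,
               (unsigned long long)scale);
        return 0;
}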
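A quick check of the stamp_sec_fraction line: 18446744073 is 2^64/10^9 truncated, so multiplying by tv_nsec (always below 10^9, hence no 64-bit overflow) and shifting right 32 leaves tv_nsec/10^9 as a 0.32 binary fraction.

#include <stdint.h>
#include <stdio.h>

int main(void)
{
        uint64_t tv_nsec = 500000000;   /* half a second */
        uint32_t frac_sec = (uint32_t)((tv_nsec * 18446744073ULL) >> 32);

        /* expect ~0.5 * 2^32 = 2147483648 */
        printf("%u (%.9f s)\n", frac_sec, frac_sec / 4294967296.0);
        return 0;
}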
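Finally, what clockevents_calc_mult_shift() and clockevent_delta2ns() set up for the decrementer, reduced to its core: device ticks = (ns * mult) >> shift, with the programmed value clamped to DECREMENTER_MAX. The fixed shift of 32 below is a simplification; the real helper picks mult and shift together to balance precision against range, bounded by its last argument (the 4 in the call above, in seconds).

#include <stdint.h>
#include <stdio.h>

#define DECREMENTER_MAX 0x7fffffff      /* 31-bit decrementer */

int main(void)
{
        uint64_t tb_freq = 512000000;   /* hypothetical 512 MHz timebase */
        uint32_t shift = 32;
        /* mult = tb_freq * 2^shift / 1e9 */
        uint32_t mult = (uint32_t)(((uint64_t)tb_freq << shift) / 1000000000ULL);

        uint64_t ns = 1000000;          /* program a 1 ms timeout */
        uint64_t ticks = (uint64_t)(((unsigned __int128)ns * mult) >> shift);

        if (ticks > DECREMENTER_MAX)
                ticks = DECREMENTER_MAX;
        printf("%llu ticks\n", (unsigned long long)ticks);  /* ~512000 */
        return 0;
}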
