Diffstat (limited to 'kernel/time/tick-sched.c')
 -rw-r--r--  kernel/time/tick-sched.c | 837
1 file changed, 589 insertions(+), 248 deletions(-)
diff --git a/kernel/time/tick-sched.c b/kernel/time/tick-sched.c
index 3e216e01bbd..6558b7ac112 100644
--- a/kernel/time/tick-sched.c
+++ b/kernel/time/tick-sched.c
@@ -19,20 +19,25 @@
 #include <linux/percpu.h>
 #include <linux/profile.h>
 #include <linux/sched.h>
-#include <linux/tick.h>
 #include <linux/module.h>
+#include <linux/irq_work.h>
+#include <linux/posix-timers.h>
+#include <linux/perf_event.h>
+#include <linux/context_tracking.h>
 
 #include <asm/irq_regs.h>
 
 #include "tick-internal.h"
 
+#include <trace/events/timer.h>
+
 /*
  * Per cpu nohz control structure
  */
-static DEFINE_PER_CPU(struct tick_sched, tick_cpu_sched);
+DEFINE_PER_CPU(struct tick_sched, tick_cpu_sched);
 
 /*
- * The time, when the last jiffy update happened. Protected by xtime_lock.
+ * The time, when the last jiffy update happened. Protected by jiffies_lock.
  */
 static ktime_t last_jiffies_update;
 
@@ -50,14 +55,14 @@ static void tick_do_update_jiffies64(ktime_t now)
 	ktime_t delta;
 
 	/*
-	 * Do a quick check without holding xtime_lock:
+	 * Do a quick check without holding jiffies_lock:
 	 */
 	delta = ktime_sub(now, last_jiffies_update);
 	if (delta.tv64 < tick_period.tv64)
 		return;
 
-	/* Reevalute with xtime_lock held */
-	write_seqlock(&xtime_lock);
+	/* Reevalute with jiffies_lock held */
+	write_seqlock(&jiffies_lock);
 
 	delta = ktime_sub(now, last_jiffies_update);
 	if (delta.tv64 >= tick_period.tv64) {
@@ -79,8 +84,12 @@ static void tick_do_update_jiffies64(ktime_t now)
 
 		/* Keep the tick_next_period variable up to date */
 		tick_next_period = ktime_add(last_jiffies_update, tick_period);
+	} else {
+		write_sequnlock(&jiffies_lock);
+		return;
 	}
-	write_sequnlock(&xtime_lock);
+	write_sequnlock(&jiffies_lock);
+	update_wall_time();
 }
 
 /*
@@ -90,24 +99,274 @@ static ktime_t tick_init_jiffy_update(void)
 {
 	ktime_t period;
 
-	write_seqlock(&xtime_lock);
+	write_seqlock(&jiffies_lock);
 	/* Did we start the jiffies update yet ? */
 	if (last_jiffies_update.tv64 == 0)
 		last_jiffies_update = tick_next_period;
 	period = last_jiffies_update;
-	write_sequnlock(&xtime_lock);
+	write_sequnlock(&jiffies_lock);
 	return period;
 }
+
+static void tick_sched_do_timer(ktime_t now)
+{
+	int cpu = smp_processor_id();
+
+#ifdef CONFIG_NO_HZ_COMMON
+	/*
+	 * Check if the do_timer duty was dropped. We don't care about
+	 * concurrency: This happens only when the cpu in charge went
+	 * into a long sleep. If two cpus happen to assign themself to
+	 * this duty, then the jiffies update is still serialized by
+	 * jiffies_lock.
+	 */
+	if (unlikely(tick_do_timer_cpu == TICK_DO_TIMER_NONE)
+	    && !tick_nohz_full_cpu(cpu))
+		tick_do_timer_cpu = cpu;
+#endif
+
+	/* Check, if the jiffies need an update */
+	if (tick_do_timer_cpu == cpu)
+		tick_do_update_jiffies64(now);
+}
+
+static void tick_sched_handle(struct tick_sched *ts, struct pt_regs *regs)
+{
+#ifdef CONFIG_NO_HZ_COMMON
+	/*
+	 * When we are idle and the tick is stopped, we have to touch
+	 * the watchdog as we might not schedule for a really long
+	 * time. This happens on complete idle SMP systems while
+	 * waiting on the login prompt. We also increment the "start of
+	 * idle" jiffy stamp so the idle accounting adjustment we do
+	 * when we go busy again does not account too much ticks.
+	 */
+	if (ts->tick_stopped) {
+		touch_softlockup_watchdog();
+		if (is_idle_task(current))
+			ts->idle_jiffies++;
+	}
+#endif
+	update_process_times(user_mode(regs));
+	profile_tick(CPU_PROFILING);
+}
+
+#ifdef CONFIG_NO_HZ_FULL
+cpumask_var_t tick_nohz_full_mask;
+bool tick_nohz_full_running;
+
+static bool can_stop_full_tick(void)
+{
+	WARN_ON_ONCE(!irqs_disabled());
+
+	if (!sched_can_stop_tick()) {
+		trace_tick_stop(0, "more than 1 task in runqueue\n");
+		return false;
+	}
+
+	if (!posix_cpu_timers_can_stop_tick(current)) {
+		trace_tick_stop(0, "posix timers running\n");
+		return false;
+	}
+
+	if (!perf_event_can_stop_tick()) {
+		trace_tick_stop(0, "perf events running\n");
+		return false;
+	}
+
+	/* sched_clock_tick() needs us? */
+#ifdef CONFIG_HAVE_UNSTABLE_SCHED_CLOCK
+	/*
+	 * TODO: kick full dynticks CPUs when
+	 * sched_clock_stable is set.
+	 */
+	if (!sched_clock_stable()) {
+		trace_tick_stop(0, "unstable sched clock\n");
+		/*
+		 * Don't allow the user to think they can get
+		 * full NO_HZ with this machine.
+		 */
+		WARN_ONCE(tick_nohz_full_running,
+			  "NO_HZ FULL will not work with unstable sched clock");
+		return false;
+	}
+#endif
+
+	return true;
+}
+
+static void tick_nohz_restart_sched_tick(struct tick_sched *ts, ktime_t now);
+
+/*
+ * Re-evaluate the need for the tick on the current CPU
+ * and restart it if necessary.
+ */
+void __tick_nohz_full_check(void)
+{
+	struct tick_sched *ts = &__get_cpu_var(tick_cpu_sched);
+
+	if (tick_nohz_full_cpu(smp_processor_id())) {
+		if (ts->tick_stopped && !is_idle_task(current)) {
+			if (!can_stop_full_tick())
+				tick_nohz_restart_sched_tick(ts, ktime_get());
+		}
+	}
+}
+
+static void nohz_full_kick_work_func(struct irq_work *work)
+{
+	__tick_nohz_full_check();
+}
+
+static DEFINE_PER_CPU(struct irq_work, nohz_full_kick_work) = {
+	.func = nohz_full_kick_work_func,
+};
+
+/*
+ * Kick the current CPU if it's full dynticks in order to force it to
+ * re-evaluate its dependency on the tick and restart it if necessary.
+ */
+void tick_nohz_full_kick(void)
+{
+	if (tick_nohz_full_cpu(smp_processor_id()))
+		irq_work_queue(&__get_cpu_var(nohz_full_kick_work));
+}
+
+static void nohz_full_kick_ipi(void *info)
+{
+	__tick_nohz_full_check();
+}
+
+/*
+ * Kick all full dynticks CPUs in order to force these to re-evaluate
+ * their dependency on the tick and restart it if necessary.
+ */
+void tick_nohz_full_kick_all(void)
+{
+	if (!tick_nohz_full_running)
+		return;
+
+	preempt_disable();
+	smp_call_function_many(tick_nohz_full_mask,
+			       nohz_full_kick_ipi, NULL, false);
+	tick_nohz_full_kick();
+	preempt_enable();
+}
+
+/*
+ * Re-evaluate the need for the tick as we switch the current task.
+ * It might need the tick due to per task/process properties:
+ * perf events, posix cpu timers, ...
+ */
+void __tick_nohz_task_switch(struct task_struct *tsk)
+{
+	unsigned long flags;
+
+	local_irq_save(flags);
+
+	if (!tick_nohz_full_cpu(smp_processor_id()))
+		goto out;
+
+	if (tick_nohz_tick_stopped() && !can_stop_full_tick())
+		tick_nohz_full_kick();
+
+out:
+	local_irq_restore(flags);
+}
+
+/* Parse the boot-time nohz CPU list from the kernel parameters. */
+static int __init tick_nohz_full_setup(char *str)
+{
+	int cpu;
+
+	alloc_bootmem_cpumask_var(&tick_nohz_full_mask);
+	if (cpulist_parse(str, tick_nohz_full_mask) < 0) {
+		pr_warning("NOHZ: Incorrect nohz_full cpumask\n");
+		return 1;
+	}
+
+	cpu = smp_processor_id();
+	if (cpumask_test_cpu(cpu, tick_nohz_full_mask)) {
+		pr_warning("NO_HZ: Clearing %d from nohz_full range for timekeeping\n", cpu);
+		cpumask_clear_cpu(cpu, tick_nohz_full_mask);
+	}
+	tick_nohz_full_running = true;
+
+	return 1;
+}
+__setup("nohz_full=", tick_nohz_full_setup);
+
+static int tick_nohz_cpu_down_callback(struct notifier_block *nfb,
+						 unsigned long action,
+						 void *hcpu)
+{
+	unsigned int cpu = (unsigned long)hcpu;
+
+	switch (action & ~CPU_TASKS_FROZEN) {
+	case CPU_DOWN_PREPARE:
+		/*
+		 * If we handle the timekeeping duty for full dynticks CPUs,
+		 * we can't safely shutdown that CPU.
+		 */
+		if (tick_nohz_full_running && tick_do_timer_cpu == cpu)
+			return NOTIFY_BAD;
+		break;
+	}
+	return NOTIFY_OK;
+}
+
+/*
+ * Worst case string length in chunks of CPU range seems 2 steps
+ * separations: 0,2,4,6,...
+ * This is NR_CPUS + sizeof('\0')
+ */
+static char __initdata nohz_full_buf[NR_CPUS + 1];
+
+static int tick_nohz_init_all(void)
+{
+	int err = -1;
+
+#ifdef CONFIG_NO_HZ_FULL_ALL
+	if (!alloc_cpumask_var(&tick_nohz_full_mask, GFP_KERNEL)) {
+		pr_err("NO_HZ: Can't allocate full dynticks cpumask\n");
+		return err;
+	}
+	err = 0;
+	cpumask_setall(tick_nohz_full_mask);
+	cpumask_clear_cpu(smp_processor_id(), tick_nohz_full_mask);
+	tick_nohz_full_running = true;
+#endif
+	return err;
+}
+
+void __init tick_nohz_init(void)
+{
+	int cpu;
+
+	if (!tick_nohz_full_running) {
+		if (tick_nohz_init_all() < 0)
+			return;
+	}
+
+	for_each_cpu(cpu, tick_nohz_full_mask)
+		context_tracking_cpu_set(cpu);
+
+	cpu_notifier(tick_nohz_cpu_down_callback, 0);
+	cpulist_scnprintf(nohz_full_buf, sizeof(nohz_full_buf), tick_nohz_full_mask);
+	pr_info("NO_HZ: Full dynticks CPUs: %s.\n", nohz_full_buf);
+}
+#endif
+
 /*
  * NOHZ - aka dynamic tick functionality
  */
-#ifdef CONFIG_NO_HZ
+#ifdef CONFIG_NO_HZ_COMMON
 /*
  * NO HZ enabled ?
  */
 static int tick_nohz_enabled __read_mostly  = 1;
-
+int tick_nohz_active  __read_mostly;
 /*
  * Enable / Disable tickless mode
  */
@@ -136,12 +395,9 @@ __setup("nohz=", setup_tick_nohz);
  */
 static void tick_nohz_update_jiffies(ktime_t now)
 {
-	int cpu = smp_processor_id();
-	struct tick_sched *ts = &per_cpu(tick_cpu_sched, cpu);
 	unsigned long flags;
 
-	cpumask_clear_cpu(cpu, nohz_cpu_mask);
-	ts->idle_waketime = now;
+	__this_cpu_write(tick_cpu_sched.idle_waketime, now);
 
 	local_irq_save(flags);
 	tick_do_update_jiffies64(now);
@@ -160,9 +416,10 @@ update_ts_time_stats(int cpu, struct tick_sched *ts, ktime_t now, u64 *last_upda
 
 	if (ts->idle_active) {
 		delta = ktime_sub(now, ts->idle_entrytime);
-		ts->idle_sleeptime = ktime_add(ts->idle_sleeptime, delta);
 		if (nr_iowait_cpu(cpu) > 0)
 			ts->iowait_sleeptime = ktime_add(ts->iowait_sleeptime, delta);
+		else
+			ts->idle_sleeptime = ktime_add(ts->idle_sleeptime, delta);
 		ts->idle_entrytime = now;
 	}
 
@@ -171,23 +428,17 @@ update_ts_time_stats(int cpu, struct tick_sched *ts, ktime_t now, u64 *last_upda
 
 }
 
-static void tick_nohz_stop_idle(int cpu, ktime_t now)
+static void tick_nohz_stop_idle(struct tick_sched *ts, ktime_t now)
 {
-	struct tick_sched *ts = &per_cpu(tick_cpu_sched, cpu);
-
-	update_ts_time_stats(cpu, ts, now, NULL);
+	update_ts_time_stats(smp_processor_id(), ts, now, NULL);
 	ts->idle_active = 0;
 
 	sched_clock_idle_wakeup_event(0);
 }
 
-static ktime_t tick_nohz_start_idle(int cpu, struct tick_sched *ts)
+static ktime_t tick_nohz_start_idle(struct tick_sched *ts)
 {
-	ktime_t now;
-
-	now = ktime_get();
-
-	update_ts_time_stats(cpu, ts, now, NULL);
+	ktime_t now = ktime_get();
 
 	ts->idle_entrytime = now;
 	ts->idle_active = 1;
@@ -198,11 +449,11 @@ static ktime_t tick_nohz_start_idle(int cpu, struct tick_sched *ts)
 /**
  * get_cpu_idle_time_us - get the total idle time of a cpu
  * @cpu: CPU number to query
- * @last_update_time: variable to store update time in
+ * @last_update_time: variable to store update time in. Do not update
+ * counters if NULL.
  *
  * Return the cummulative idle time (since boot) for a given
- * CPU, in microseconds. The idle time returned includes
- * the iowait time (unlike what "top" and co report).
+ * CPU, in microseconds.
  *
  * This time is measured via accounting rather than sampling,
  * and is as accurate as ktime_get() is.
@@ -212,20 +463,35 @@ static ktime_t tick_nohz_start_idle(int cpu, struct tick_sched *ts)
 u64 get_cpu_idle_time_us(int cpu, u64 *last_update_time)
 {
 	struct tick_sched *ts = &per_cpu(tick_cpu_sched, cpu);
+	ktime_t now, idle;
 
-	if (!tick_nohz_enabled)
+	if (!tick_nohz_active)
 		return -1;
 
-	update_ts_time_stats(cpu, ts, ktime_get(), last_update_time);
+	now = ktime_get();
+	if (last_update_time) {
+		update_ts_time_stats(cpu, ts, now, last_update_time);
+		idle = ts->idle_sleeptime;
+	} else {
+		if (ts->idle_active && !nr_iowait_cpu(cpu)) {
+			ktime_t delta = ktime_sub(now, ts->idle_entrytime);
+
+			idle = ktime_add(ts->idle_sleeptime, delta);
+		} else {
+			idle = ts->idle_sleeptime;
+		}
+	}
+
+	return ktime_to_us(idle);
 
-	return ktime_to_us(ts->idle_sleeptime);
 }
 EXPORT_SYMBOL_GPL(get_cpu_idle_time_us);
 
-/*
+/**
  * get_cpu_iowait_time_us - get the total iowait time of a cpu
  * @cpu: CPU number to query
- * @last_update_time: variable to store update time in
+ * @last_update_time: variable to store update time in. Do not update
+ * counters if NULL.
  *
  * Return the cummulative iowait time (since boot) for a given
  * CPU, in microseconds.
@@ -238,106 +504,66 @@ EXPORT_SYMBOL_GPL(get_cpu_idle_time_us);
 u64 get_cpu_iowait_time_us(int cpu, u64 *last_update_time)
 {
 	struct tick_sched *ts = &per_cpu(tick_cpu_sched, cpu);
+	ktime_t now, iowait;
 
-	if (!tick_nohz_enabled)
+	if (!tick_nohz_active)
 		return -1;
 
-	update_ts_time_stats(cpu, ts, ktime_get(), last_update_time);
+	now = ktime_get();
+	if (last_update_time) {
+		update_ts_time_stats(cpu, ts, now, last_update_time);
+		iowait = ts->iowait_sleeptime;
+	} else {
+		if (ts->idle_active && nr_iowait_cpu(cpu) > 0) {
+			ktime_t delta = ktime_sub(now, ts->idle_entrytime);
+
+			iowait = ktime_add(ts->iowait_sleeptime, delta);
+		} else {
+			iowait = ts->iowait_sleeptime;
+		}
+	}
 
-	return ktime_to_us(ts->iowait_sleeptime);
+	return ktime_to_us(iowait);
 }
 EXPORT_SYMBOL_GPL(get_cpu_iowait_time_us);
 
-/**
- * tick_nohz_stop_sched_tick - stop the idle tick from the idle task
- *
- * When the next event is more than a tick into the future, stop the idle tick
- * Called either from the idle loop or from irq_exit() when an idle period was
- * just interrupted by an interrupt which did not cause a reschedule.
- */
-void tick_nohz_stop_sched_tick(int inidle)
+static ktime_t tick_nohz_stop_sched_tick(struct tick_sched *ts,
+					 ktime_t now, int cpu)
 {
-	unsigned long seq, last_jiffies, next_jiffies, delta_jiffies, flags;
-	struct tick_sched *ts;
-	ktime_t last_update, expires, now;
+	unsigned long seq, last_jiffies, next_jiffies, delta_jiffies;
+	ktime_t last_update, expires, ret = { .tv64 = 0 };
+	unsigned long rcu_delta_jiffies;
 	struct clock_event_device *dev = __get_cpu_var(tick_cpu_device).evtdev;
 	u64 time_delta;
-	int cpu;
-
-	local_irq_save(flags);
-
-	cpu = smp_processor_id();
-	ts = &per_cpu(tick_cpu_sched, cpu);
-
-	/*
-	 * Call to tick_nohz_start_idle stops the last_update_time from being
-	 * updated. Thus, it must not be called in the event we are called from
-	 * irq_exit() with the prior state different than idle.
-	 */
-	if (!inidle && !ts->inidle)
-		goto end;
-
-	/*
-	 * Set ts->inidle unconditionally. Even if the system did not
-	 * switch to NOHZ mode the cpu frequency governers rely on the
-	 * update of the idle time accounting in tick_nohz_start_idle().
-	 */
-	ts->inidle = 1;
-
-	now = tick_nohz_start_idle(cpu, ts);
-
-	/*
-	 * If this cpu is offline and it is the one which updates
-	 * jiffies, then give up the assignment and let it be taken by
-	 * the cpu which runs the tick timer next. If we don't drop
-	 * this here the jiffies might be stale and do_timer() never
-	 * invoked.
-	 */
-	if (unlikely(!cpu_online(cpu))) {
-		if (cpu == tick_do_timer_cpu)
-			tick_do_timer_cpu = TICK_DO_TIMER_NONE;
-	}
-
-	if (unlikely(ts->nohz_mode == NOHZ_MODE_INACTIVE))
-		goto end;
-
-	if (need_resched())
-		goto end;
-
-	if (unlikely(local_softirq_pending() && cpu_online(cpu))) {
-		static int ratelimit;
-		if (ratelimit < 10) {
-			printk(KERN_ERR "NOHZ: local_softirq_pending %02x\n",
-			       (unsigned int) local_softirq_pending());
-			ratelimit++;
-		}
-		goto end;
-	}
+	time_delta = timekeeping_max_deferment();
 
-	ts->idle_calls++;
 	/* Read jiffies and the time when jiffies were updated last */
 	do {
-		seq = read_seqbegin(&xtime_lock);
+		seq = read_seqbegin(&jiffies_lock);
 		last_update = last_jiffies_update;
 		last_jiffies = jiffies;
-		time_delta = timekeeping_max_deferment();
-	} while (read_seqretry(&xtime_lock, seq));
+	} while (read_seqretry(&jiffies_lock, seq));
 
-	if (rcu_needs_cpu(cpu) || printk_needs_cpu(cpu) ||
-	    arch_needs_cpu(cpu)) {
+	if (rcu_needs_cpu(cpu, &rcu_delta_jiffies) ||
+	    arch_needs_cpu(cpu) || irq_work_needs_cpu()) {
 		next_jiffies = last_jiffies + 1;
 		delta_jiffies = 1;
 	} else {
 		/* Get the next timer wheel timer */
 		next_jiffies = get_next_timer_interrupt(last_jiffies);
 		delta_jiffies = next_jiffies - last_jiffies;
+		if (rcu_delta_jiffies < delta_jiffies) {
+			next_jiffies = last_jiffies + rcu_delta_jiffies;
			delta_jiffies = rcu_delta_jiffies;
+		}
 	}
+
 	/*
-	 * Do not stop the tick, if we are only one off
-	 * or if the cpu is required for rcu
+	 * Do not stop the tick, if we are only one off (or less)
+	 * or if the cpu is required for RCU:
 	 */
-	if (!ts->tick_stopped && delta_jiffies == 1)
+	if (!ts->tick_stopped && delta_jiffies <= 1)
 		goto out;
 
 	/* Schedule the tick, if we are at least one jiffie off */
@@ -366,6 +592,13 @@ void tick_nohz_stop_sched_tick(int inidle)
 			time_delta = KTIME_MAX;
 		}
 
+#ifdef CONFIG_NO_HZ_FULL
+		if (!ts->inidle) {
+			time_delta = min(time_delta,
+					 scheduler_tick_max_deferment());
+		}
+#endif
+
 		/*
 		 * calculate the expiry time for the next timer wheel
 		 * timer. delta_jiffies >= NEXT_TIMER_MAX_DELTA signals
@@ -390,13 +623,12 @@ void tick_nohz_stop_sched_tick(int inidle)
 		else
 			expires.tv64 = KTIME_MAX;
 
-		if (delta_jiffies > 1)
-			cpumask_set_cpu(cpu, nohz_cpu_mask);
-
 		/* Skip reprogram of event if its not changed */
 		if (ts->tick_stopped && ktime_equal(expires, dev->next_event))
 			goto out;
 
+		ret = expires;
+
 		/*
 		 * nohz_stop_sched_tick can be called several times before
 		 * the nohz_restart_sched_tick is called. This happens when
@@ -405,19 +637,14 @@ void tick_nohz_stop_sched_tick(int inidle)
 		 * the scheduler tick in nohz_restart_sched_tick.
 		 */
 		if (!ts->tick_stopped) {
-			select_nohz_load_balancer(1);
+			nohz_balance_enter_idle(cpu);
+			calc_load_enter_idle();
 
-			ts->idle_tick = hrtimer_get_expires(&ts->sched_timer);
+			ts->last_tick = hrtimer_get_expires(&ts->sched_timer);
 			ts->tick_stopped = 1;
-			ts->idle_jiffies = last_jiffies;
-			rcu_enter_nohz();
+			trace_tick_stop(1, " ");
 		}
 
-		ts->idle_sleeps++;
-
-		/* Mark expires */
-		ts->idle_expires = expires;
-
 		/*
 		 * If the expiration time == KTIME_MAX, then
 		 * in this case we simply stop the tick timer.
@@ -442,15 +669,162 @@ void tick_nohz_stop_sched_tick(int inidle)
 		 * softirq.
 		 */
 		tick_do_update_jiffies64(ktime_get());
-		cpumask_clear_cpu(cpu, nohz_cpu_mask);
 	}
 	raise_softirq_irqoff(TIMER_SOFTIRQ);
 out:
 	ts->next_jiffies = next_jiffies;
 	ts->last_jiffies = last_jiffies;
 	ts->sleep_length = ktime_sub(dev->next_event, now);
-end:
-	local_irq_restore(flags);
+
+	return ret;
+}
+
+static void tick_nohz_full_stop_tick(struct tick_sched *ts)
+{
+#ifdef CONFIG_NO_HZ_FULL
+	int cpu = smp_processor_id();
+
+	if (!tick_nohz_full_cpu(cpu) || is_idle_task(current))
+		return;
+
+	if (!ts->tick_stopped && ts->nohz_mode == NOHZ_MODE_INACTIVE)
+		return;
+
+	if (!can_stop_full_tick())
+		return;
+
+	tick_nohz_stop_sched_tick(ts, ktime_get(), cpu);
+#endif
+}
+
+static bool can_stop_idle_tick(int cpu, struct tick_sched *ts)
+{
+	/*
+	 * If this cpu is offline and it is the one which updates
+	 * jiffies, then give up the assignment and let it be taken by
+	 * the cpu which runs the tick timer next. If we don't drop
+	 * this here the jiffies might be stale and do_timer() never
+	 * invoked.
+	 */
+	if (unlikely(!cpu_online(cpu))) {
+		if (cpu == tick_do_timer_cpu)
+			tick_do_timer_cpu = TICK_DO_TIMER_NONE;
+		return false;
+	}
+
+	if (unlikely(ts->nohz_mode == NOHZ_MODE_INACTIVE)) {
+		ts->sleep_length = (ktime_t) { .tv64 = NSEC_PER_SEC/HZ };
+		return false;
+	}
+
+	if (need_resched())
+		return false;
+
+	if (unlikely(local_softirq_pending() && cpu_online(cpu))) {
+		static int ratelimit;
+
+		if (ratelimit < 10 &&
+		    (local_softirq_pending() & SOFTIRQ_STOP_IDLE_MASK)) {
+			pr_warn("NOHZ: local_softirq_pending %02x\n",
+				(unsigned int) local_softirq_pending());
+			ratelimit++;
+		}
+		return false;
+	}
+
+	if (tick_nohz_full_enabled()) {
+		/*
+		 * Keep the tick alive to guarantee timekeeping progression
+		 * if there are full dynticks CPUs around
+		 */
+		if (tick_do_timer_cpu == cpu)
+			return false;
+		/*
+		 * Boot safety: make sure the timekeeping duty has been
+		 * assigned before entering dyntick-idle mode,
+		 */
+		if (tick_do_timer_cpu == TICK_DO_TIMER_NONE)
+			return false;
+	}
+
+	return true;
+}
+
+static void __tick_nohz_idle_enter(struct tick_sched *ts)
+{
+	ktime_t now, expires;
+	int cpu = smp_processor_id();
+
+	now = tick_nohz_start_idle(ts);
+
+	if (can_stop_idle_tick(cpu, ts)) {
+		int was_stopped = ts->tick_stopped;
+
+		ts->idle_calls++;
+
+		expires = tick_nohz_stop_sched_tick(ts, now, cpu);
+		if (expires.tv64 > 0LL) {
+			ts->idle_sleeps++;
+			ts->idle_expires = expires;
+		}
+
+		if (!was_stopped && ts->tick_stopped)
+			ts->idle_jiffies = ts->last_jiffies;
+	}
+}
+
+/**
+ * tick_nohz_idle_enter - stop the idle tick from the idle task
+ *
+ * When the next event is more than a tick into the future, stop the idle tick
+ * Called when we start the idle loop.
+ *
+ * The arch is responsible of calling:
+ *
+ * - rcu_idle_enter() after its last use of RCU before the CPU is put
+ *  to sleep.
+ * - rcu_idle_exit() before the first use of RCU after the CPU is woken up.
+ */
+void tick_nohz_idle_enter(void)
+{
+	struct tick_sched *ts;
+
+	WARN_ON_ONCE(irqs_disabled());
+
+	/*
+ 	 * Update the idle state in the scheduler domain hierarchy
+ 	 * when tick_nohz_stop_sched_tick() is called from the idle loop.
+ 	 * State will be updated to busy during the first busy tick after
+ 	 * exiting idle.
+ 	 */
+	set_cpu_sd_state_idle();
+
+	local_irq_disable();
+
+	ts = &__get_cpu_var(tick_cpu_sched);
+	ts->inidle = 1;
+	__tick_nohz_idle_enter(ts);
+
+	local_irq_enable();
+}
+EXPORT_SYMBOL_GPL(tick_nohz_idle_enter);
+
+/**
+ * tick_nohz_irq_exit - update next tick event from interrupt exit
+ *
+ * When an interrupt fires while we are idle and it doesn't cause
+ * a reschedule, it may still add, modify or delete a timer, enqueue
+ * an RCU callback, etc...
+ * So we need to re-calculate and reprogram the next tick event.
+ */
+void tick_nohz_irq_exit(void)
+{
+	struct tick_sched *ts = &__get_cpu_var(tick_cpu_sched);
+
+	if (ts->inidle)
+		__tick_nohz_idle_enter(ts);
+	else
+		tick_nohz_full_stop_tick(ts);
 }
 
 /**
@@ -468,7 +842,7 @@ ktime_t tick_nohz_get_sleep_length(void)
 static void tick_nohz_restart(struct tick_sched *ts, ktime_t now)
 {
 	hrtimer_cancel(&ts->sched_timer);
-	hrtimer_set_expires(&ts->sched_timer, ts->idle_tick);
+	hrtimer_set_expires(&ts->sched_timer, ts->last_tick);
 
 	while (1) {
 		/* Forward the time to expire in the future */
@@ -485,49 +859,36 @@ static void tick_nohz_restart(struct tick_sched *ts, ktime_t now)
 				hrtimer_get_expires(&ts->sched_timer), 0))
 				break;
 		}
-		/* Update jiffies and reread time */
-		tick_do_update_jiffies64(now);
+		/* Reread time and update jiffies */
 		now = ktime_get();
+		tick_do_update_jiffies64(now);
 	}
 }
 
-/**
- * tick_nohz_restart_sched_tick - restart the idle tick from the idle task
- *
- * Restart the idle tick when the CPU is woken up from idle
- */
-void tick_nohz_restart_sched_tick(void)
+static void tick_nohz_restart_sched_tick(struct tick_sched *ts, ktime_t now)
 {
-	int cpu = smp_processor_id();
-	struct tick_sched *ts = &per_cpu(tick_cpu_sched, cpu);
-#ifndef CONFIG_VIRT_CPU_ACCOUNTING
-	unsigned long ticks;
-#endif
-	ktime_t now;
-
-	local_irq_disable();
-	if (ts->idle_active || (ts->inidle && ts->tick_stopped))
-		now = ktime_get();
-
-	if (ts->idle_active)
-		tick_nohz_stop_idle(cpu, now);
-
-	if (!ts->inidle || !ts->tick_stopped) {
-		ts->inidle = 0;
-		local_irq_enable();
-		return;
-	}
+	/* Update jiffies first */
+	tick_do_update_jiffies64(now);
+	update_cpu_load_nohz();
 
-	ts->inidle = 0;
+	calc_load_exit_idle();
+	touch_softlockup_watchdog();
+	/*
+	 * Cancel the scheduled timer and restore the tick
+	 */
+	ts->tick_stopped  = 0;
+	ts->idle_exittime = now;
 
-	rcu_exit_nohz();
+	tick_nohz_restart(ts, now);
+}
 
-	/* Update jiffies first */
-	select_nohz_load_balancer(0);
-	tick_do_update_jiffies64(now);
-	cpumask_clear_cpu(cpu, nohz_cpu_mask);
+static void tick_nohz_account_idle_ticks(struct tick_sched *ts)
+{
+#ifndef CONFIG_VIRT_CPU_ACCOUNTING_NATIVE
+	unsigned long ticks;
 
-#ifndef CONFIG_VIRT_CPU_ACCOUNTING
+	if (vtime_accounting_enabled())
+		return;
 	/*
 	 * We stopped the tick in idle. Update process times would miss the
 	 * time we slept as update_process_times does only a 1 tick
@@ -540,18 +901,40 @@ void tick_nohz_restart_sched_tick(void)
 	if (ticks && ticks < LONG_MAX)
 		account_idle_ticks(ticks);
 #endif
+}
 
-	touch_softlockup_watchdog();
-	/*
-	 * Cancel the scheduled timer and restore the tick
-	 */
-	ts->tick_stopped  = 0;
-	ts->idle_exittime = now;
+/**
+ * tick_nohz_idle_exit - restart the idle tick from the idle task
+ *
+ * Restart the idle tick when the CPU is woken up from idle
+ * This also exit the RCU extended quiescent state. The CPU
+ * can use RCU again after this function is called.
+ */
+void tick_nohz_idle_exit(void)
+{
+	struct tick_sched *ts = &__get_cpu_var(tick_cpu_sched);
+	ktime_t now;
 
-	tick_nohz_restart(ts, now);
+	local_irq_disable();
+
+	WARN_ON_ONCE(!ts->inidle);
+
+	ts->inidle = 0;
+
+	if (ts->idle_active || ts->tick_stopped)
+		now = ktime_get();
+
+	if (ts->idle_active)
+		tick_nohz_stop_idle(ts, now);
+
+	if (ts->tick_stopped) {
+		tick_nohz_restart_sched_tick(ts, now);
+		tick_nohz_account_idle_ticks(ts);
+	}
 
 	local_irq_enable();
 }
+EXPORT_SYMBOL_GPL(tick_nohz_idle_exit);
 
 static int tick_nohz_reprogram(struct tick_sched *ts, ktime_t now)
 {
@@ -566,40 +949,12 @@ static void tick_nohz_handler(struct clock_event_device *dev)
 {
 	struct tick_sched *ts = &__get_cpu_var(tick_cpu_sched);
 	struct pt_regs *regs = get_irq_regs();
-	int cpu = smp_processor_id();
 	ktime_t now = ktime_get();
 
 	dev->next_event.tv64 = KTIME_MAX;
 
-	/*
-	 * Check if the do_timer duty was dropped. We don't care about
-	 * concurrency: This happens only when the cpu in charge went
-	 * into a long sleep. If two cpus happen to assign themself to
-	 * this duty, then the jiffies update is still serialized by
-	 * xtime_lock.
-	 */
-	if (unlikely(tick_do_timer_cpu == TICK_DO_TIMER_NONE))
-		tick_do_timer_cpu = cpu;
-
-	/* Check, if the jiffies need an update */
-	if (tick_do_timer_cpu == cpu)
-		tick_do_update_jiffies64(now);
-
-	/*
-	 * When we are idle and the tick is stopped, we have to touch
-	 * the watchdog as we might not schedule for a really long
-	 * time. This happens on complete idle SMP systems while
-	 * waiting on the login prompt. We also increment the "start
-	 * of idle" jiffy stamp so the idle accounting adjustment we
-	 * do when we go busy again does not account too much ticks.
-	 */
-	if (ts->tick_stopped) {
-		touch_softlockup_watchdog();
-		ts->idle_jiffies++;
-	}
-
-	update_process_times(user_mode(regs));
-	profile_tick(CPU_PROFILING);
+	tick_sched_do_timer(now);
+	tick_sched_handle(ts, regs);
 
 	while (tick_nohz_reprogram(ts, now)) {
 		now = ktime_get();
@@ -623,7 +978,7 @@ static void tick_nohz_switch_to_nohz(void)
 		local_irq_enable();
 		return;
 	}
-
+	tick_nohz_active = 1;
 	ts->nohz_mode = NOHZ_MODE_LOWRES;
 
 	/*
@@ -641,9 +996,6 @@ static void tick_nohz_switch_to_nohz(void)
 		next = ktime_add(next, tick_period);
 	}
 	local_irq_enable();
-
-	printk(KERN_INFO "Switched to NOHz mode on CPU #%d\n",
-	       smp_processor_id());
 }
 
 /*
@@ -657,12 +1009,10 @@ static void tick_nohz_switch_to_nohz(void)
 * timer and do not touch the other magic bits which need to be done
 * when idle is left.
 */
-static void tick_nohz_kick_tick(int cpu, ktime_t now)
+static void tick_nohz_kick_tick(struct tick_sched *ts, ktime_t now)
 {
 #if 0
 	/* Switch back to 2.6.27 behaviour */
-
-	struct tick_sched *ts = &per_cpu(tick_cpu_sched, cpu);
 	ktime_t delta;
 
 	/*
@@ -677,36 +1027,36 @@ static void tick_nohz_kick_tick(int cpu, ktime_t now)
 #endif
 }
 
-static inline void tick_check_nohz(int cpu)
+static inline void tick_nohz_irq_enter(void)
 {
-	struct tick_sched *ts = &per_cpu(tick_cpu_sched, cpu);
+	struct tick_sched *ts = &__get_cpu_var(tick_cpu_sched);
 	ktime_t now;
 
 	if (!ts->idle_active && !ts->tick_stopped)
 		return;
 	now = ktime_get();
 	if (ts->idle_active)
-		tick_nohz_stop_idle(cpu, now);
+		tick_nohz_stop_idle(ts, now);
 	if (ts->tick_stopped) {
 		tick_nohz_update_jiffies(now);
-		tick_nohz_kick_tick(cpu, now);
+		tick_nohz_kick_tick(ts, now);
 	}
 }
 
 #else
 
 static inline void tick_nohz_switch_to_nohz(void) { }
-static inline void tick_check_nohz(int cpu) { }
+static inline void tick_nohz_irq_enter(void) { }
 
-#endif /* NO_HZ */
+#endif /* CONFIG_NO_HZ_COMMON */
 
 /*
 * Called from irq_enter to notify about the possible interruption of idle()
 */
-void tick_check_idle(int cpu)
+void tick_irq_enter(void)
 {
-	tick_check_oneshot_broadcast(cpu);
-	tick_check_nohz(cpu);
+	tick_check_oneshot_broadcast_this_cpu();
+	tick_nohz_irq_enter();
 }
 
 /*
@@ -715,7 +1065,7 @@ void tick_check_idle(int cpu)
 #ifdef CONFIG_HIGH_RES_TIMERS
 /*
 * We rearm the timer until we get disabled by the idle code.
- * Called with interrupts disabled and timer->base->cpu_base->lock held.
+ * Called with interrupts disabled.
 */
 static enum hrtimer_restart tick_sched_timer(struct hrtimer *timer)
 {
@@ -723,50 +1073,31 @@ static enum hrtimer_restart tick_sched_timer(struct hrtimer *timer)
 		container_of(timer, struct tick_sched, sched_timer);
 	struct pt_regs *regs = get_irq_regs();
 	ktime_t now = ktime_get();
-	int cpu = smp_processor_id();
 
-#ifdef CONFIG_NO_HZ
-	/*
-	 * Check if the do_timer duty was dropped. We don't care about
-	 * concurrency: This happens only when the cpu in charge went
-	 * into a long sleep. If two cpus happen to assign themself to
-	 * this duty, then the jiffies update is still serialized by
-	 * xtime_lock.
-	 */
-	if (unlikely(tick_do_timer_cpu == TICK_DO_TIMER_NONE))
-		tick_do_timer_cpu = cpu;
-#endif
-
-	/* Check, if the jiffies need an update */
-	if (tick_do_timer_cpu == cpu)
-		tick_do_update_jiffies64(now);
+	tick_sched_do_timer(now);
 
 	/*
 	 * Do not call, when we are not in irq context and have
 	 * no valid regs pointer
 	 */
-	if (regs) {
-		/*
-		 * When we are idle and the tick is stopped, we have to touch
-		 * the watchdog as we might not schedule for a really long
-		 * time. This happens on complete idle SMP systems while
-		 * waiting on the login prompt. We also increment the "start of
-		 * idle" jiffy stamp so the idle accounting adjustment we do
-		 * when we go busy again does not account too much ticks.
-		 */
-		if (ts->tick_stopped) {
-			touch_softlockup_watchdog();
-			ts->idle_jiffies++;
-		}
-		update_process_times(user_mode(regs));
-		profile_tick(CPU_PROFILING);
-	}
+	if (regs)
+		tick_sched_handle(ts, regs);
 
 	hrtimer_forward(timer, now, tick_period);
 
 	return HRTIMER_RESTART;
 }
 
+static int sched_skew_tick;
+
+static int __init skew_tick(char *str)
+{
+	get_option(&str, &sched_skew_tick);
+
+	return 0;
+}
+early_param("skew_tick", skew_tick);
+
 /**
 * tick_setup_sched_timer - setup the tick emulation timer
 */
@@ -784,6 +1115,14 @@ void tick_setup_sched_timer(void)
 	/* Get the next period (per cpu) */
 	hrtimer_set_expires(&ts->sched_timer, tick_init_jiffy_update());
 
+	/* Offset the tick to avert jiffies_lock contention. */
+	if (sched_skew_tick) {
+		u64 offset = ktime_to_ns(tick_period) >> 1;
+		do_div(offset, num_possible_cpus());
+		offset *= smp_processor_id();
+		hrtimer_add_expires_ns(&ts->sched_timer, offset);
+	}
+
 	for (;;) {
 		hrtimer_forward(&ts->sched_timer, now, tick_period);
 		hrtimer_start_expires(&ts->sched_timer,
@@ -794,14 +1133,16 @@ void tick_setup_sched_timer(void)
 		now = ktime_get();
 	}
 
-#ifdef CONFIG_NO_HZ
-	if (tick_nohz_enabled)
+#ifdef CONFIG_NO_HZ_COMMON
+	if (tick_nohz_enabled) {
 		ts->nohz_mode = NOHZ_MODE_HIGHRES;
+		tick_nohz_active = 1;
+	}
 #endif
 }
 #endif /* HIGH_RES_TIMERS */
 
-#if defined CONFIG_NO_HZ || defined CONFIG_HIGH_RES_TIMERS
+#if defined CONFIG_NO_HZ_COMMON || defined CONFIG_HIGH_RES_TIMERS
 void tick_cancel_sched_timer(int cpu)
 {
 	struct tick_sched *ts = &per_cpu(tick_cpu_sched, cpu);
@@ -811,7 +1152,7 @@ void tick_cancel_sched_timer(int cpu)
 		hrtimer_cancel(&ts->sched_timer);
 # endif
 
-	ts->nohz_mode = NOHZ_MODE_INACTIVE;
+	memset(ts, 0, sizeof(*ts));
 }
 #endif
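For readers experimenting with the skew_tick= boot parameter added above, the per-CPU offset computed in tick_setup_sched_timer() can be reproduced in a small standalone sketch. The HZ value and CPU count below are illustrative assumptions, not values taken from the patch:

/*
 * Standalone sketch of the skew_tick offset arithmetic from
 * tick_setup_sched_timer(): each CPU's tick is shifted by
 * (tick_period / 2) / num_possible_cpus * cpu_id.
 * Assumptions (not from the patch): HZ = 1000, 4 possible CPUs.
 */
#include <stdio.h>
#include <stdint.h>

int main(void)
{
	const uint64_t tick_period_ns = 1000000000ULL / 1000;	/* 1 ms tick at HZ=1000 */
	const unsigned int num_possible_cpus = 4;

	for (unsigned int cpu = 0; cpu < num_possible_cpus; cpu++) {
		/* Spread the CPUs evenly across half a tick period */
		uint64_t offset = (tick_period_ns >> 1) / num_possible_cpus * cpu;
		printf("cpu%u: tick offset %llu ns\n", cpu, (unsigned long long)offset);
	}
	return 0;
}

With these assumed values the ticks of the four CPUs land 125 us apart, which is what keeps them from contending on jiffies_lock at the same instant.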
