Diffstat (limited to 'kernel/posix-cpu-timers.c')
-rw-r--r--	kernel/posix-cpu-timers.c	1002
1 file changed, 428 insertions, 574 deletions
diff --git a/kernel/posix-cpu-timers.c b/kernel/posix-cpu-timers.c
index 6842eeba587..3b8946416a5 100644
--- a/kernel/posix-cpu-timers.c
+++ b/kernel/posix-cpu-timers.c
@@ -9,6 +9,9 @@
 #include <asm/uaccess.h>
 #include <linux/kernel_stat.h>
 #include <trace/events/timer.h>
+#include <linux/random.h>
+#include <linux/tick.h>
+#include <linux/workqueue.h>
 
 /*
  * Called after updating RLIMIT_CPU to run cpu timer and update
@@ -37,83 +40,39 @@ static int check_clock(const clockid_t which_clock)
 	if (pid == 0)
 		return 0;
 
-	read_lock(&tasklist_lock);
+	rcu_read_lock();
 	p = find_task_by_vpid(pid);
 	if (!p || !(CPUCLOCK_PERTHREAD(which_clock) ?
-		   same_thread_group(p, current) : thread_group_leader(p))) {
+		   same_thread_group(p, current) : has_group_leader_pid(p))) {
 		error = -EINVAL;
 	}
-	read_unlock(&tasklist_lock);
+	rcu_read_unlock();
 
 	return error;
 }
 
-static inline union cpu_time_count
+static inline unsigned long long
 timespec_to_sample(const clockid_t which_clock, const struct timespec *tp)
 {
-	union cpu_time_count ret;
-	ret.sched = 0;		/* high half always zero when .cpu used */
+	unsigned long long ret;
+
+	ret = 0;		/* high half always zero when .cpu used */
 	if (CPUCLOCK_WHICH(which_clock) == CPUCLOCK_SCHED) {
-		ret.sched = (unsigned long long)tp->tv_sec * NSEC_PER_SEC + tp->tv_nsec;
+		ret = (unsigned long long)tp->tv_sec * NSEC_PER_SEC + tp->tv_nsec;
 	} else {
-		ret.cpu = timespec_to_cputime(tp);
+		ret = cputime_to_expires(timespec_to_cputime(tp));
 	}
 	return ret;
 }
 
 static void sample_to_timespec(const clockid_t which_clock,
-			       union cpu_time_count cpu,
+			       unsigned long long expires,
 			       struct timespec *tp)
 {
 	if (CPUCLOCK_WHICH(which_clock) == CPUCLOCK_SCHED)
-		*tp = ns_to_timespec(cpu.sched);
+		*tp = ns_to_timespec(expires);
 	else
-		cputime_to_timespec(cpu.cpu, tp);
-}
-
-static inline int cpu_time_before(const clockid_t which_clock,
-				  union cpu_time_count now,
-				  union cpu_time_count then)
-{
-	if (CPUCLOCK_WHICH(which_clock) == CPUCLOCK_SCHED) {
-		return now.sched < then.sched;
-	}  else {
-		return cputime_lt(now.cpu, then.cpu);
-	}
-}
-static inline void cpu_time_add(const clockid_t which_clock,
-				union cpu_time_count *acc,
-			        union cpu_time_count val)
-{
-	if (CPUCLOCK_WHICH(which_clock) == CPUCLOCK_SCHED) {
-		acc->sched += val.sched;
-	}  else {
-		acc->cpu = cputime_add(acc->cpu, val.cpu);
-	}
-}
-static inline union cpu_time_count cpu_time_sub(const clockid_t which_clock,
-						union cpu_time_count a,
-						union cpu_time_count b)
-{
-	if (CPUCLOCK_WHICH(which_clock) == CPUCLOCK_SCHED) {
-		a.sched -= b.sched;
-	}  else {
-		a.cpu = cputime_sub(a.cpu, b.cpu);
-	}
-	return a;
-}
-
-/*
- * Divide and limit the result to res >= 1
- *
- * This is necessary to prevent signal delivery starvation, when the result of
- * the division would be rounded down to 0.
- */
-static inline cputime_t cputime_div_non_zero(cputime_t time, unsigned long div)
-{
-	cputime_t res = cputime_div(time, div);
-
-	return max_t(cputime_t, res, 1);
+		cputime_to_timespec((__force cputime_t)expires, tp);
 }
 
 /*
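The hunks above replace the two-member union cpu_time_count with a single unsigned long long sample. A minimal userspace sketch of the resulting conversion, assuming nanosecond units for both clock flavours (illustrative names, not the kernel API):

	#include <stdint.h>
	#include <time.h>

	#define NSEC_PER_SEC 1000000000ULL

	/* Both clock flavours now fit one 64-bit value: nanoseconds for
	 * CPUCLOCK_SCHED, cputime ticks (modelled here as ns too) otherwise. */
	static uint64_t timespec_to_sample64(const struct timespec *tp)
	{
		return (uint64_t)tp->tv_sec * NSEC_PER_SEC + (uint64_t)tp->tv_nsec;
	}

With one scalar type, the cpu_time_before()/cpu_time_add()/cpu_time_sub() helpers removed above collapse into ordinary <, + and - operators.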
@@ -121,62 +80,68 @@ static inline cputime_t cputime_div_non_zero(cputime_t time, unsigned long div)
  * given the current clock sample.
  */
 static void bump_cpu_timer(struct k_itimer *timer,
-				  union cpu_time_count now)
+			   unsigned long long now)
 {
 	int i;
+	unsigned long long delta, incr;
 
-	if (timer->it.cpu.incr.sched == 0)
+	if (timer->it.cpu.incr == 0)
 		return;
 
-	if (CPUCLOCK_WHICH(timer->it_clock) == CPUCLOCK_SCHED) {
-		unsigned long long delta, incr;
+	if (now < timer->it.cpu.expires)
+		return;
 
-		if (now.sched < timer->it.cpu.expires.sched)
-			return;
-		incr = timer->it.cpu.incr.sched;
-		delta = now.sched + incr - timer->it.cpu.expires.sched;
-		/* Don't use (incr*2 < delta), incr*2 might overflow. */
-		for (i = 0; incr < delta - incr; i++)
-			incr = incr << 1;
-		for (; i >= 0; incr >>= 1, i--) {
-			if (delta < incr)
-				continue;
-			timer->it.cpu.expires.sched += incr;
-			timer->it_overrun += 1 << i;
-			delta -= incr;
-		}
-	} else {
-		cputime_t delta, incr;
+	incr = timer->it.cpu.incr;
+	delta = now + incr - timer->it.cpu.expires;
 
-		if (cputime_lt(now.cpu, timer->it.cpu.expires.cpu))
-			return;
-		incr = timer->it.cpu.incr.cpu;
-		delta = cputime_sub(cputime_add(now.cpu, incr),
-				    timer->it.cpu.expires.cpu);
-		/* Don't use (incr*2 < delta), incr*2 might overflow. */
-		for (i = 0; cputime_lt(incr, cputime_sub(delta, incr)); i++)
-			     incr = cputime_add(incr, incr);
-		for (; i >= 0; incr = cputime_halve(incr), i--) {
-			if (cputime_lt(delta, incr))
-				continue;
-			timer->it.cpu.expires.cpu =
-				cputime_add(timer->it.cpu.expires.cpu, incr);
-			timer->it_overrun += 1 << i;
-			delta = cputime_sub(delta, incr);
-		}
+	/* Don't use (incr*2 < delta), incr*2 might overflow. */
+	for (i = 0; incr < delta - incr; i++)
+		incr = incr << 1;
+
+	for (; i >= 0; incr >>= 1, i--) {
+		if (delta < incr)
+			continue;
+
+		timer->it.cpu.expires += incr;
+		timer->it_overrun += 1 << i;
+		delta -= incr;
 	}
 }
 
-static inline cputime_t prof_ticks(struct task_struct *p)
+/**
+ * task_cputime_zero - Check a task_cputime struct for all zero fields.
+ *
+ * @cputime:	The struct to compare.
+ *
+ * Checks @cputime to see if all fields are zero.  Returns true if all fields
+ * are zero, false if any field is nonzero.
+ */
+static inline int task_cputime_zero(const struct task_cputime *cputime)
 {
-	return cputime_add(p->utime, p->stime);
+	if (!cputime->utime && !cputime->stime && !cputime->sum_exec_runtime)
+		return 1;
+	return 0;
 }
-static inline cputime_t virt_ticks(struct task_struct *p)
+
+static inline unsigned long long prof_ticks(struct task_struct *p)
 {
-	return p->utime;
+	cputime_t utime, stime;
+
+	task_cputime(p, &utime, &stime);
+
+	return cputime_to_expires(utime + stime);
+}
+static inline unsigned long long virt_ticks(struct task_struct *p)
+{
+	cputime_t utime;
+
+	task_cputime(p, &utime, NULL);
+
+	return cputime_to_expires(utime);
 }
 
-int posix_cpu_clock_getres(const clockid_t which_clock, struct timespec *tp)
+static int
+posix_cpu_clock_getres(const clockid_t which_clock, struct timespec *tp)
 {
 	int error = check_clock(which_clock);
 	if (!error) {
@@ -194,7 +159,8 @@ int posix_cpu_clock_getres(const clockid_t which_clock, struct timespec *tp)
 	return error;
 }
 
-int posix_cpu_clock_set(const clockid_t which_clock, const struct timespec *tp)
+static int
+posix_cpu_clock_set(const clockid_t which_clock, const struct timespec *tp)
 {
 	/*
 	 * You can never reset a CPU clock, but we check for other errors
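A minimal userspace sketch of the overrun accounting that the simplified bump_cpu_timer() above performs: advance "expires" past "now" in power-of-two multiples of the increment, while counting the overruns. Types and names are illustrative, not the kernel API:

	#include <stdint.h>

	static uint64_t bump(uint64_t *expires, uint64_t incr, uint64_t now,
			     uint64_t *overrun)
	{
		int i;
		uint64_t delta;

		if (incr == 0 || now < *expires)
			return *expires;

		delta = now + incr - *expires;
		/* Don't test (incr*2 < delta); incr*2 might overflow. */
		for (i = 0; incr < delta - incr; i++)
			incr <<= 1;
		/* Walk back down, adding the largest fitting chunks first. */
		for (; i >= 0; incr >>= 1, i--) {
			if (delta < incr)
				continue;
			*expires += incr;
			*overrun += 1ULL << i;
			delta -= incr;
		}
		return *expires;
	}

The doubling/halving walk makes the cost logarithmic in the number of missed periods instead of linear, which matters when a timer has been unserviced for a long time.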
@@ -212,54 +178,30 @@ int posix_cpu_clock_set(const clockid_t which_clock, const struct timespec *tp)
  * Sample a per-thread clock for the given task.
  */
 static int cpu_clock_sample(const clockid_t which_clock, struct task_struct *p,
-			    union cpu_time_count *cpu)
+			    unsigned long long *sample)
 {
 	switch (CPUCLOCK_WHICH(which_clock)) {
 	default:
 		return -EINVAL;
 	case CPUCLOCK_PROF:
-		cpu->cpu = prof_ticks(p);
+		*sample = prof_ticks(p);
 		break;
 	case CPUCLOCK_VIRT:
-		cpu->cpu = virt_ticks(p);
+		*sample = virt_ticks(p);
 		break;
 	case CPUCLOCK_SCHED:
-		cpu->sched = task_sched_runtime(p);
+		*sample = task_sched_runtime(p);
 		break;
 	}
 	return 0;
 }
 
-void thread_group_cputime(struct task_struct *tsk, struct task_cputime *times)
-{
-	struct signal_struct *sig = tsk->signal;
-	struct task_struct *t;
-
-	times->utime = sig->utime;
-	times->stime = sig->stime;
-	times->sum_exec_runtime = sig->sum_sched_runtime;
-
-	rcu_read_lock();
-	/* make sure we can trust tsk->thread_group list */
-	if (!likely(pid_alive(tsk)))
-		goto out;
-
-	t = tsk;
-	do {
-		times->utime = cputime_add(times->utime, t->utime);
-		times->stime = cputime_add(times->stime, t->stime);
-		times->sum_exec_runtime += t->se.sum_exec_runtime;
-	} while_each_thread(tsk, t);
-out:
-	rcu_read_unlock();
-}
-
 static void update_gt_cputime(struct task_cputime *a, struct task_cputime *b)
 {
-	if (cputime_gt(b->utime, a->utime))
+	if (b->utime > a->utime)
 		a->utime = b->utime;
 
-	if (cputime_gt(b->stime, a->stime))
+	if (b->stime > a->stime)
 		a->stime = b->stime;
 
 	if (b->sum_exec_runtime > a->sum_exec_runtime)
@@ -272,9 +214,7 @@ void thread_group_cputimer(struct task_struct *tsk, struct task_cputime *times)
 	struct task_cputime sum;
 	unsigned long flags;
 
-	spin_lock_irqsave(&cputimer->lock, flags);
 	if (!cputimer->running) {
-		cputimer->running = 1;
 		/*
 		 * The POSIX timer interface allows for absolute time expiry
 		 * values through the TIMER_ABSTIME flag, therefore we have
@@ -282,19 +222,23 @@ void thread_group_cputimer(struct task_struct *tsk, struct task_cputime *times)
 		 * it.
 		 */
 		thread_group_cputime(tsk, &sum);
+		raw_spin_lock_irqsave(&cputimer->lock, flags);
+		cputimer->running = 1;
 		update_gt_cputime(&cputimer->cputime, &sum);
-	}
+	} else
+		raw_spin_lock_irqsave(&cputimer->lock, flags);
 	*times = cputimer->cputime;
-	spin_unlock_irqrestore(&cputimer->lock, flags);
+	raw_spin_unlock_irqrestore(&cputimer->lock, flags);
 }
 
 /*
  * Sample a process (thread group) clock for the given group_leader task.
- * Must be called with tasklist_lock held for reading.
+ * Must be called with task sighand lock held for safe while_each_thread()
+ * traversal.
  */
 static int cpu_clock_sample_group(const clockid_t which_clock,
 				  struct task_struct *p,
-				  union cpu_time_count *cpu)
+				  unsigned long long *sample)
 {
 	struct task_cputime cputime;
 
@@ -303,43 +247,67 @@ static int cpu_clock_sample_group(const clockid_t which_clock,
 		return -EINVAL;
 	case CPUCLOCK_PROF:
 		thread_group_cputime(p, &cputime);
-		cpu->cpu = cputime_add(cputime.utime, cputime.stime);
+		*sample = cputime_to_expires(cputime.utime + cputime.stime);
 		break;
 	case CPUCLOCK_VIRT:
 		thread_group_cputime(p, &cputime);
-		cpu->cpu = cputime.utime;
+		*sample = cputime_to_expires(cputime.utime);
 		break;
 	case CPUCLOCK_SCHED:
-		cpu->sched = thread_group_sched_runtime(p);
+		thread_group_cputime(p, &cputime);
+		*sample = cputime.sum_exec_runtime;
 		break;
 	}
 	return 0;
 }
 
+static int posix_cpu_clock_get_task(struct task_struct *tsk,
+				    const clockid_t which_clock,
+				    struct timespec *tp)
+{
+	int err = -EINVAL;
+	unsigned long long rtn;
+
+	if (CPUCLOCK_PERTHREAD(which_clock)) {
+		if (same_thread_group(tsk, current))
+			err = cpu_clock_sample(which_clock, tsk, &rtn);
+	} else {
+		unsigned long flags;
+		struct sighand_struct *sighand;
+
+		/*
+		 * while_each_thread() is not yet entirely RCU safe,
+		 * keep locking the group while sampling process
+		 * clock for now.
+		 */
+		sighand = lock_task_sighand(tsk, &flags);
+		if (!sighand)
+			return err;
+
+		if (tsk == current || thread_group_leader(tsk))
+			err = cpu_clock_sample_group(which_clock, tsk, &rtn);
+
+		unlock_task_sighand(tsk, &flags);
+	}
+
+	if (!err)
+		sample_to_timespec(which_clock, rtn, tp);
+
+	return err;
+}
+
-int posix_cpu_clock_get(const clockid_t which_clock, struct timespec *tp)
+static int posix_cpu_clock_get(const clockid_t which_clock, struct timespec *tp)
 {
 	const pid_t pid = CPUCLOCK_PID(which_clock);
-	int error = -EINVAL;
-	union cpu_time_count rtn;
+	int err = -EINVAL;
 
 	if (pid == 0) {
 		/*
 		 * Special case constant value for our own clocks.
 		 * We don't have to do any lookup to find ourselves.
 		 */
-		if (CPUCLOCK_PERTHREAD(which_clock)) {
-			/*
-			 * Sampling just ourselves we can do with no locking.
-			 */
-			error = cpu_clock_sample(which_clock,
-						 current, &rtn);
-		} else {
-			read_lock(&tasklist_lock);
-			error = cpu_clock_sample_group(which_clock,
-						       current, &rtn);
-			read_unlock(&tasklist_lock);
-		}
+		err = posix_cpu_clock_get_task(current, which_clock, tp);
 	} else {
 		/*
 		 * Find the given PID, and validate that the caller
@@ -348,29 +316,12 @@ int posix_cpu_clock_get(const clockid_t which_clock, struct timespec *tp)
 		struct task_struct *p;
 		rcu_read_lock();
 		p = find_task_by_vpid(pid);
-		if (p) {
-			if (CPUCLOCK_PERTHREAD(which_clock)) {
-				if (same_thread_group(p, current)) {
-					error = cpu_clock_sample(which_clock,
-								 p, &rtn);
-				}
-			} else {
-				read_lock(&tasklist_lock);
-				if (thread_group_leader(p) && p->sighand) {
-					error =
-					    cpu_clock_sample_group(which_clock,
-							           p, &rtn);
-				}
-				read_unlock(&tasklist_lock);
-			}
-		}
+		if (p)
+			err = posix_cpu_clock_get_task(p, which_clock, tp);
 		rcu_read_unlock();
 	}
 
-	if (error)
-		return error;
-	sample_to_timespec(which_clock, rtn, tp);
-	return 0;
+	return err;
 }
 
@@ -379,7 +330,7 @@ int posix_cpu_clock_get(const clockid_t which_clock, struct timespec *tp)
  * This is called from sys_timer_create() and do_cpu_nanosleep() with the
  * new timer already all-zeros initialized.
  */
-int posix_cpu_timer_create(struct k_itimer *new_timer)
+static int posix_cpu_timer_create(struct k_itimer *new_timer)
 {
 	int ret = 0;
 	const pid_t pid = CPUCLOCK_PID(new_timer->it_clock);
@@ -390,7 +341,7 @@ int posix_cpu_timer_create(struct k_itimer *new_timer)
 
 	INIT_LIST_HEAD(&new_timer->it.cpu.entry);
 
-	read_lock(&tasklist_lock);
+	rcu_read_lock();
 	if (CPUCLOCK_PERTHREAD(new_timer->it_clock)) {
 		if (pid == 0) {
 			p = current;
@@ -404,7 +355,7 @@ int posix_cpu_timer_create(struct k_itimer *new_timer)
 			p = current->group_leader;
 		} else {
 			p = find_task_by_vpid(pid);
-			if (p && !thread_group_leader(p))
+			if (p && !has_group_leader_pid(p))
 				p = NULL;
 		}
 	}
@@ -414,7 +365,7 @@ int posix_cpu_timer_create(struct k_itimer *new_timer)
 	} else {
 		ret = -EINVAL;
 	}
-	read_unlock(&tasklist_lock);
+	rcu_read_unlock();
 
 	return ret;
 }
@@ -425,79 +376,60 @@ int posix_cpu_timer_create(struct k_itimer *new_timer)
 * If we return TIMER_RETRY, it's necessary to release the timer's lock
 * and try again.  (This happens when the timer is in the middle of firing.)
 */
-int posix_cpu_timer_del(struct k_itimer *timer)
+static int posix_cpu_timer_del(struct k_itimer *timer)
 {
-	struct task_struct *p = timer->it.cpu.task;
 	int ret = 0;
+	unsigned long flags;
+	struct sighand_struct *sighand;
+	struct task_struct *p = timer->it.cpu.task;
 
-	if (likely(p != NULL)) {
-		read_lock(&tasklist_lock);
-		if (unlikely(p->sighand == NULL)) {
-			/*
-			 * We raced with the reaping of the task.
-			 * The deletion should have cleared us off the list.
-			 */
-			BUG_ON(!list_empty(&timer->it.cpu.entry));
-		} else {
-			spin_lock(&p->sighand->siglock);
-			if (timer->it.cpu.firing)
-				ret = TIMER_RETRY;
-			else
-				list_del(&timer->it.cpu.entry);
-			spin_unlock(&p->sighand->siglock);
-		}
-		read_unlock(&tasklist_lock);
+	WARN_ON_ONCE(p == NULL);
+
+	/*
+	 * Protect against sighand release/switch in exit/exec and process/
+	 * thread timer list entry concurrent read/writes.
+	 */
+	sighand = lock_task_sighand(p, &flags);
+	if (unlikely(sighand == NULL)) {
+		/*
+		 * We raced with the reaping of the task.
+		 * The deletion should have cleared us off the list.
		 */
+		WARN_ON_ONCE(!list_empty(&timer->it.cpu.entry));
+	} else {
+		if (timer->it.cpu.firing)
+			ret = TIMER_RETRY;
+		else
+			list_del(&timer->it.cpu.entry);
-		if (!ret)
-			put_task_struct(p);
+		unlock_task_sighand(p, &flags);
 	}
 
+	if (!ret)
+		put_task_struct(p);
+
 	return ret;
 }
 
+static void cleanup_timers_list(struct list_head *head)
+{
+	struct cpu_timer_list *timer, *next;
+
+	list_for_each_entry_safe(timer, next, head, entry)
+		list_del_init(&timer->entry);
+}
+
 /*
  * Clean out CPU timers still ticking when a thread exited.  The task
  * pointer is cleared, and the expiry time is replaced with the residual
  * time for later timer_gettime calls to return.
  * This must be called with the siglock held.
  */
-static void cleanup_timers(struct list_head *head,
-			   cputime_t utime, cputime_t stime,
-			   unsigned long long sum_exec_runtime)
+static void cleanup_timers(struct list_head *head)
 {
-	struct cpu_timer_list *timer, *next;
-	cputime_t ptime = cputime_add(utime, stime);
-
-	list_for_each_entry_safe(timer, next, head, entry) {
-		list_del_init(&timer->entry);
-		if (cputime_lt(timer->expires.cpu, ptime)) {
-			timer->expires.cpu = cputime_zero;
-		} else {
-			timer->expires.cpu = cputime_sub(timer->expires.cpu,
-							 ptime);
-		}
-	}
-
-	++head;
-	list_for_each_entry_safe(timer, next, head, entry) {
-		list_del_init(&timer->entry);
-		if (cputime_lt(timer->expires.cpu, utime)) {
-			timer->expires.cpu = cputime_zero;
-		} else {
-			timer->expires.cpu = cputime_sub(timer->expires.cpu,
-							 utime);
-		}
-	}
-
-	++head;
-	list_for_each_entry_safe(timer, next, head, entry) {
-		list_del_init(&timer->entry);
-		if (timer->expires.sched < sum_exec_runtime) {
-			timer->expires.sched = 0;
-		} else {
-			timer->expires.sched -= sum_exec_runtime;
-		}
-	}
+	cleanup_timers_list(head);
+	cleanup_timers_list(++head);
+	cleanup_timers_list(++head);
 }
 
 /*
@@ -507,43 +439,24 @@ static void cleanup_timers(struct list_head *head,
  */
 void posix_cpu_timers_exit(struct task_struct *tsk)
 {
-	cleanup_timers(tsk->cpu_timers,
-		       tsk->utime, tsk->stime, tsk->se.sum_exec_runtime);
+	add_device_randomness((const void*) &tsk->se.sum_exec_runtime,
+						sizeof(unsigned long long));
+	cleanup_timers(tsk->cpu_timers);
 }
 void posix_cpu_timers_exit_group(struct task_struct *tsk)
 {
-	struct signal_struct *const sig = tsk->signal;
-
-	cleanup_timers(tsk->signal->cpu_timers,
-		       cputime_add(tsk->utime, sig->utime),
-		       cputime_add(tsk->stime, sig->stime),
-		       tsk->se.sum_exec_runtime + sig->sum_sched_runtime);
-}
-
-static void clear_dead_task(struct k_itimer *timer, union cpu_time_count now)
-{
-	/*
-	 * That's all for this thread or process.
-	 * We leave our residual in expires to be reported.
-	 */
-	put_task_struct(timer->it.cpu.task);
-	timer->it.cpu.task = NULL;
-	timer->it.cpu.expires = cpu_time_sub(timer->it_clock,
-					     timer->it.cpu.expires,
-					     now);
+	cleanup_timers(tsk->signal->cpu_timers);
 }
 
 static inline int expires_gt(cputime_t expires, cputime_t new_exp)
 {
-	return cputime_eq(expires, cputime_zero) ||
-	       cputime_gt(expires, new_exp);
+	return expires == 0 || expires > new_exp;
 }
 
 /*
  * Insert the timer on the appropriate list before any timers that
- * expire later.  This must be called with the tasklist_lock held
- * for reading, interrupts disabled and p->sighand->siglock taken.
+ * expire later.  This must be called with the sighand lock held.
  */
 static void arm_timer(struct k_itimer *timer)
 {
@@ -564,14 +477,14 @@ static void arm_timer(struct k_itimer *timer)
 	listpos = head;
 	list_for_each_entry(next, head, entry) {
-		if (cpu_time_before(timer->it_clock, nt->expires, next->expires))
+		if (nt->expires < next->expires)
 			break;
 		listpos = &next->entry;
 	}
 	list_add(&nt->entry, listpos);
 
 	if (listpos == head) {
-		union cpu_time_count *exp = &nt->expires;
+		unsigned long long exp = nt->expires;
 
 		/*
 		 * We are the new earliest-expiring POSIX 1.b timer, hence
@@ -582,17 +495,17 @@ static void arm_timer(struct k_itimer *timer)
 
 		switch (CPUCLOCK_WHICH(timer->it_clock)) {
 		case CPUCLOCK_PROF:
-			if (expires_gt(cputime_expires->prof_exp, exp->cpu))
-				cputime_expires->prof_exp = exp->cpu;
+			if (expires_gt(cputime_expires->prof_exp, expires_to_cputime(exp)))
+				cputime_expires->prof_exp = expires_to_cputime(exp);
 			break;
 		case CPUCLOCK_VIRT:
-			if (expires_gt(cputime_expires->virt_exp, exp->cpu))
-				cputime_expires->virt_exp = exp->cpu;
+			if (expires_gt(cputime_expires->virt_exp, expires_to_cputime(exp)))
+				cputime_expires->virt_exp = expires_to_cputime(exp);
 			break;
 		case CPUCLOCK_SCHED:
 			if (cputime_expires->sched_exp == 0 ||
-			    cputime_expires->sched_exp > exp->sched)
-				cputime_expires->sched_exp = exp->sched;
+			    cputime_expires->sched_exp > exp)
+				cputime_expires->sched_exp = exp;
 			break;
 		}
 	}
@@ -607,20 +520,20 @@ static void cpu_timer_fire(struct k_itimer *timer)
 		/*
 		 * User don't want any signal.
 		 */
-		timer->it.cpu.expires.sched = 0;
+		timer->it.cpu.expires = 0;
 	} else if (unlikely(timer->sigq == NULL)) {
 		/*
 		 * This a special case for clock_nanosleep,
 		 * not a normal timer from sys_timer_create.
 		 */
 		wake_up_process(timer->it_process);
-		timer->it.cpu.expires.sched = 0;
-	} else if (timer->it.cpu.incr.sched == 0) {
+		timer->it.cpu.expires = 0;
+	} else if (timer->it.cpu.incr == 0) {
 		/*
 		 * One-shot timer.  Clear it as soon as it's fired.
 		 */
 		posix_timer_event(timer, 0);
-		timer->it.cpu.expires.sched = 0;
+		timer->it.cpu.expires = 0;
 	} else if (posix_timer_event(timer, ++timer->it_requeue_pending)) {
 		/*
 		 * The signal did not get queued because the signal
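A rough standalone illustration of the sorted insert that arm_timer() performs now that expiry values are plain integers: walk to the first strictly-later entry, link in before it, and report whether the new timer became the head (the case where the kernel refreshes its cached earliest-expiry fields). Mock singly linked list, not the kernel's list_head API:

	#include <stdint.h>
	#include <stddef.h>

	struct tlist {
		uint64_t expires;
		struct tlist *next;
	};

	/* Insert nt in ascending order of expires; returns nonzero if nt is
	 * the new earliest timer. Equal expiries keep insertion order, as in
	 * arm_timer(). */
	static int sorted_insert(struct tlist **head, struct tlist *nt)
	{
		struct tlist **pos = head;

		while (*pos && (*pos)->expires <= nt->expires)
			pos = &(*pos)->next;
		nt->next = *pos;
		*pos = nt;
		return pos == head;
	}

Keeping the list sorted lets the expiry scan stop at the first unexpired entry, and caching the head's value lets the tick fast path avoid touching the list at all.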
@@ -634,11 +547,12 @@ static void cpu_timer_fire(struct k_itimer *timer)
 
 /*
  * Sample a process (thread group) timer for the given group_leader task.
- * Must be called with tasklist_lock held for reading.
+ * Must be called with task sighand lock held for safe while_each_thread()
+ * traversal.
  */
 static int cpu_timer_sample_group(const clockid_t which_clock,
 				  struct task_struct *p,
-				  union cpu_time_count *cpu)
+				  unsigned long long *sample)
 {
 	struct task_cputime cputime;
 
@@ -647,61 +561,89 @@ static int cpu_timer_sample_group(const clockid_t which_clock,
 	default:
 		return -EINVAL;
 	case CPUCLOCK_PROF:
-		cpu->cpu = cputime_add(cputime.utime, cputime.stime);
+		*sample = cputime_to_expires(cputime.utime + cputime.stime);
 		break;
 	case CPUCLOCK_VIRT:
-		cpu->cpu = cputime.utime;
+		*sample = cputime_to_expires(cputime.utime);
 		break;
 	case CPUCLOCK_SCHED:
-		cpu->sched = cputime.sum_exec_runtime + task_delta_exec(p);
+		*sample = cputime.sum_exec_runtime + task_delta_exec(p);
 		break;
 	}
 	return 0;
 }
 
+#ifdef CONFIG_NO_HZ_FULL
+static void nohz_kick_work_fn(struct work_struct *work)
+{
+	tick_nohz_full_kick_all();
+}
+
+static DECLARE_WORK(nohz_kick_work, nohz_kick_work_fn);
+
+/*
+ * We need the IPIs to be sent from sane process context.
+ * The posix cpu timers are always set with irqs disabled.
+ */
+static void posix_cpu_timer_kick_nohz(void)
+{
+	if (context_tracking_is_enabled())
+		schedule_work(&nohz_kick_work);
+}
+
+bool posix_cpu_timers_can_stop_tick(struct task_struct *tsk)
+{
+	if (!task_cputime_zero(&tsk->cputime_expires))
+		return false;
+
+	if (tsk->signal->cputimer.running)
+		return false;
+
+	return true;
+}
+#else
+static inline void posix_cpu_timer_kick_nohz(void) { }
+#endif
+
 /*
  * Guts of sys_timer_settime for CPU timers.
  * This is called with the timer locked and interrupts disabled.
 * If we return TIMER_RETRY, it's necessary to release the timer's lock
 * and try again.  (This happens when the timer is in the middle of firing.)
 */
-int posix_cpu_timer_set(struct k_itimer *timer, int flags,
-			struct itimerspec *new, struct itimerspec *old)
+static int posix_cpu_timer_set(struct k_itimer *timer, int timer_flags,
+			       struct itimerspec *new, struct itimerspec *old)
 {
+	unsigned long flags;
+	struct sighand_struct *sighand;
 	struct task_struct *p = timer->it.cpu.task;
-	union cpu_time_count old_expires, new_expires, old_incr, val;
+	unsigned long long old_expires, new_expires, old_incr, val;
 	int ret;
 
-	if (unlikely(p == NULL)) {
-		/*
-		 * Timer refers to a dead task's clock.
-		 */
-		return -ESRCH;
-	}
+	WARN_ON_ONCE(p == NULL);
 
 	new_expires = timespec_to_sample(timer->it_clock, &new->it_value);
 
-	read_lock(&tasklist_lock);
 	/*
-	 * We need the tasklist_lock to protect against reaping that
-	 * clears p->sighand.  If p has just been reaped, we can no
+	 * Protect against sighand release/switch in exit/exec and p->cpu_timers
+	 * and p->signal->cpu_timers read/write in arm_timer()
+	 */
+	sighand = lock_task_sighand(p, &flags);
+	/*
+	 * If p has just been reaped, we can no
 	 * longer get any information about it at all.
 	 */
-	if (unlikely(p->sighand == NULL)) {
-		read_unlock(&tasklist_lock);
-		put_task_struct(p);
-		timer->it.cpu.task = NULL;
+	if (unlikely(sighand == NULL)) {
 		return -ESRCH;
 	}
 
 	/*
 	 * Disarm any old timer after extracting its expiry time.
 	 */
-	BUG_ON(!irqs_disabled());
+	WARN_ON_ONCE(!irqs_disabled());
 
 	ret = 0;
 	old_incr = timer->it.cpu.incr;
-	spin_lock(&p->sighand->siglock);
 	old_expires = timer->it.cpu.expires;
 	if (unlikely(timer->it.cpu.firing)) {
 		timer->it.cpu.firing = -1;
@@ -724,7 +666,7 @@ int posix_cpu_timer_set(struct k_itimer *timer, int flags,
 	}
 
 	if (old) {
-		if (old_expires.sched == 0) {
+		if (old_expires == 0) {
 			old->it_value.tv_sec = 0;
 			old->it_value.tv_nsec = 0;
 		} else {
@@ -739,11 +681,8 @@ int posix_cpu_timer_set(struct k_itimer *timer, int flags,
 			 * new setting.
 			 */
 			bump_cpu_timer(timer, val);
-			if (cpu_time_before(timer->it_clock, val,
-					    timer->it.cpu.expires)) {
-				old_expires = cpu_time_sub(
-					timer->it_clock,
-					timer->it.cpu.expires, val);
+			if (val < timer->it.cpu.expires) {
+				old_expires = timer->it.cpu.expires - val;
 				sample_to_timespec(timer->it_clock,
 						   old_expires,
 						   &old->it_value);
@@ -761,13 +700,12 @@ int posix_cpu_timer_set(struct k_itimer *timer, int flags,
 		 * disable this firing since we are already reporting
 		 * it as an overrun (thanks to bump_cpu_timer above).
 		 */
-		spin_unlock(&p->sighand->siglock);
-		read_unlock(&tasklist_lock);
+		unlock_task_sighand(p, &flags);
 		goto out;
 	}
 
-	if (new_expires.sched != 0 && !(flags & TIMER_ABSTIME)) {
-		cpu_time_add(timer->it_clock, &new_expires, val);
+	if (new_expires != 0 && !(timer_flags & TIMER_ABSTIME)) {
+		new_expires += val;
 	}
 
 	/*
@@ -776,14 +714,11 @@ int posix_cpu_timer_set(struct k_itimer *timer, int flags,
 	 * arm the timer (we'll just fake it for timer_gettime).
 	 */
 	timer->it.cpu.expires = new_expires;
-	if (new_expires.sched != 0 &&
-	    cpu_time_before(timer->it_clock, val, new_expires)) {
+	if (new_expires != 0 && val < new_expires) {
 		arm_timer(timer);
 	}
 
-	spin_unlock(&p->sighand->siglock);
-	read_unlock(&tasklist_lock);
-
+	unlock_task_sighand(p, &flags);
 	/*
 	 * Install the new reload setting, and
 	 * set up the signal and overrun bookkeeping.
@@ -801,8 +736,7 @@ int posix_cpu_timer_set(struct k_itimer *timer, int flags,
 	timer->it_overrun_last = 0;
 	timer->it_overrun = -1;
 
-	if (new_expires.sched != 0 &&
-	    !cpu_time_before(timer->it_clock, val, new_expires)) {
+	if (new_expires != 0 && !(val < new_expires)) {
 		/*
 		 * The designated time already passed, so we notify
 		 * immediately, even if the thread never runs to
@@ -817,14 +751,17 @@ int posix_cpu_timer_set(struct k_itimer *timer, int flags,
 		sample_to_timespec(timer->it_clock,
 				   old_incr, &old->it_interval);
 	}
+	if (!ret)
+		posix_cpu_timer_kick_nohz();
 	return ret;
 }
 
-void posix_cpu_timer_get(struct k_itimer *timer, struct itimerspec *itp)
+static void posix_cpu_timer_get(struct k_itimer *timer, struct itimerspec *itp)
 {
-	union cpu_time_count now;
+	unsigned long long now;
 	struct task_struct *p = timer->it.cpu.task;
-	int clear_dead;
+
+	WARN_ON_ONCE(p == NULL);
 
 	/*
 	 * Easy part: convert the reload time.
@@ -832,63 +769,44 @@ void posix_cpu_timer_get(struct k_itimer *timer, struct itimerspec *itp)
 	sample_to_timespec(timer->it_clock,
 			   timer->it.cpu.incr, &itp->it_interval);
 
-	if (timer->it.cpu.expires.sched == 0) {	/* Timer not armed at all.  */
+	if (timer->it.cpu.expires == 0) {	/* Timer not armed at all.  */
 		itp->it_value.tv_sec = itp->it_value.tv_nsec = 0;
 		return;
 	}
 
-	if (unlikely(p == NULL)) {
-		/*
-		 * This task already died and the timer will never fire.
-		 * In this case, expires is actually the dead value.
-		 */
-	dead:
-		sample_to_timespec(timer->it_clock, timer->it.cpu.expires,
-				   &itp->it_value);
-		return;
-	}
-
 	/*
 	 * Sample the clock to take the difference with the expiry time.
 	 */
 	if (CPUCLOCK_PERTHREAD(timer->it_clock)) {
 		cpu_clock_sample(timer->it_clock, p, &now);
-		clear_dead = p->exit_state;
 	} else {
-		read_lock(&tasklist_lock);
-		if (unlikely(p->sighand == NULL)) {
+		struct sighand_struct *sighand;
+		unsigned long flags;
+
+		/*
+		 * Protect against sighand release/switch in exit/exec and
+		 * also make timer sampling safe if it ends up calling
+		 * thread_group_cputime().
+		 */
+		sighand = lock_task_sighand(p, &flags);
+		if (unlikely(sighand == NULL)) {
 			/*
 			 * The process has been reaped.
 			 * We can't even collect a sample any more.
 			 * Call the timer disarmed, nothing else to do.
 			 */
-			put_task_struct(p);
-			timer->it.cpu.task = NULL;
-			timer->it.cpu.expires.sched = 0;
-			read_unlock(&tasklist_lock);
-			goto dead;
+			timer->it.cpu.expires = 0;
+			sample_to_timespec(timer->it_clock, timer->it.cpu.expires,
+					   &itp->it_value);
 		} else {
 			cpu_timer_sample_group(timer->it_clock, p, &now);
-			clear_dead = (unlikely(p->exit_state) &&
-				      thread_group_empty(p));
+			unlock_task_sighand(p, &flags);
 		}
-		read_unlock(&tasklist_lock);
 	}
 
-	if (unlikely(clear_dead)) {
-		/*
-		 * We've noticed that the thread is dead, but
-		 * not yet reaped.  Take this opportunity to
-		 * drop our task ref.
-		 */
-		clear_dead_task(timer, now);
-		goto dead;
-	}
-
-	if (cpu_time_before(timer->it_clock, now, timer->it.cpu.expires)) {
+	if (now < timer->it.cpu.expires) {
 		sample_to_timespec(timer->it_clock,
-				   cpu_time_sub(timer->it_clock,
-						timer->it.cpu.expires, now),
+				   timer->it.cpu.expires - now,
 				   &itp->it_value);
 	} else {
 		/*
@@ -900,6 +818,28 @@ void posix_cpu_timer_get(struct k_itimer *timer, struct itimerspec *itp)
 	}
 }
 
+static unsigned long long
+check_timers_list(struct list_head *timers,
+		  struct list_head *firing,
+		  unsigned long long curr)
+{
+	int maxfire = 20;
+
+	while (!list_empty(timers)) {
+		struct cpu_timer_list *t;
+
+		t = list_first_entry(timers, struct cpu_timer_list, entry);
+
+		if (!--maxfire || curr < t->expires)
+			return t->expires;
+
+		t->firing = 1;
+		list_move_tail(&t->entry, firing);
+	}
+
+	return 0;
+}
+
 /*
  * Check for any per-thread CPU timers that have fired and move them off
  * the tsk->cpu_timers[N] list onto the firing list.  Here we update the
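The new check_timers_list() helper above, and the cleanup_timers() rewrite earlier, both rely on the per-task cpu_timers field being an array of three expiry lists, one per clock type, so callers can step through them with ++timers. A rough standalone illustration of that layout (mock types and names, not the kernel's definitions):

	#include <stdint.h>

	enum { PROF, VIRT, SCHED, NCLOCKS };	/* mirrors CPUCLOCK_* ordering */

	struct entry { uint64_t expires; struct entry *next; };

	struct mock_task {
		struct entry *cpu_timers[NCLOCKS];	/* one sorted list per clock */
	};

	/* Walk all three lists the way cleanup_timers()/check_thread_timers()
	 * do: take the first list head and step to the next ones with
	 * pointer arithmetic. */
	static void visit_all(struct entry **head)
	{
		for (int i = 0; i < NCLOCKS; i++, head++)
			for (struct entry *e = *head; e; e = e->next)
				(void)e->expires;	/* process entry */
	}

The maxfire cap of 20 in check_timers_list() bounds the work done per invocation; any remaining expired timers are picked up on a later tick.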
@@ -908,54 +848,20 @@ void posix_cpu_timer_get(struct k_itimer *timer, struct itimerspec *itp)
 static void check_thread_timers(struct task_struct *tsk,
 				struct list_head *firing)
 {
-	int maxfire;
 	struct list_head *timers = tsk->cpu_timers;
 	struct signal_struct *const sig = tsk->signal;
+	struct task_cputime *tsk_expires = &tsk->cputime_expires;
+	unsigned long long expires;
 	unsigned long soft;
 
-	maxfire = 20;
-	tsk->cputime_expires.prof_exp = cputime_zero;
-	while (!list_empty(timers)) {
-		struct cpu_timer_list *t = list_first_entry(timers,
-						      struct cpu_timer_list,
-						      entry);
-		if (!--maxfire || cputime_lt(prof_ticks(tsk), t->expires.cpu)) {
-			tsk->cputime_expires.prof_exp = t->expires.cpu;
-			break;
-		}
-		t->firing = 1;
-		list_move_tail(&t->entry, firing);
-	}
+	expires = check_timers_list(timers, firing, prof_ticks(tsk));
+	tsk_expires->prof_exp = expires_to_cputime(expires);
 
-	++timers;
-	maxfire = 20;
-	tsk->cputime_expires.virt_exp = cputime_zero;
-	while (!list_empty(timers)) {
-		struct cpu_timer_list *t = list_first_entry(timers,
-						      struct cpu_timer_list,
-						      entry);
-		if (!--maxfire || cputime_lt(virt_ticks(tsk), t->expires.cpu)) {
-			tsk->cputime_expires.virt_exp = t->expires.cpu;
-			break;
-		}
-		t->firing = 1;
-		list_move_tail(&t->entry, firing);
-	}
+	expires = check_timers_list(++timers, firing, virt_ticks(tsk));
+	tsk_expires->virt_exp = expires_to_cputime(expires);
 
-	++timers;
-	maxfire = 20;
-	tsk->cputime_expires.sched_exp = 0;
-	while (!list_empty(timers)) {
-		struct cpu_timer_list *t = list_first_entry(timers,
-						      struct cpu_timer_list,
-						      entry);
-		if (!--maxfire || tsk->se.sum_exec_runtime < t->expires.sched) {
-			tsk->cputime_expires.sched_exp = t->expires.sched;
-			break;
-		}
-		t->firing = 1;
-		list_move_tail(&t->entry, firing);
-	}
+	tsk_expires->sched_exp = check_timers_list(++timers, firing,
						   tsk->se.sum_exec_runtime);
 
 	/*
 	 * Check for the special case thread timers.
@@ -995,30 +901,30 @@ static void check_thread_timers(struct task_struct *tsk,
 	struct thread_group_cputimer *cputimer = &sig->cputimer;
 	unsigned long flags;
 
-	spin_lock_irqsave(&cputimer->lock, flags);
+	raw_spin_lock_irqsave(&cputimer->lock, flags);
 	cputimer->running = 0;
-	spin_unlock_irqrestore(&cputimer->lock, flags);
+	raw_spin_unlock_irqrestore(&cputimer->lock, flags);
 }
 
 static u32 onecputick;
 
 static void check_cpu_itimer(struct task_struct *tsk, struct cpu_itimer *it,
-			     cputime_t *expires, cputime_t cur_time, int signo)
+			     unsigned long long *expires,
+			     unsigned long long cur_time, int signo)
 {
-	if (cputime_eq(it->expires, cputime_zero))
+	if (!it->expires)
 		return;
 
-	if (cputime_ge(cur_time, it->expires)) {
-		if (!cputime_eq(it->incr, cputime_zero)) {
-			it->expires = cputime_add(it->expires, it->incr);
+	if (cur_time >= it->expires) {
+		if (it->incr) {
+			it->expires += it->incr;
 			it->error += it->incr_error;
 			if (it->error >= onecputick) {
-				it->expires = cputime_sub(it->expires,
-							  cputime_one_jiffy);
+				it->expires -= cputime_one_jiffy;
 				it->error -= onecputick;
 			}
 		} else {
-			it->expires = cputime_zero;
+			it->expires = 0;
 		}
 
 		trace_itimer_expire(signo == SIGPROF ?
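A rough model of the rounding-error bookkeeping in check_cpu_itimer() above: the reload interval was rounded up to cputime granularity, and the time lost per reload accumulates in an error counter until a whole tick can be given back. Userspace demo with illustrative units, not the kernel structures:

	#include <stdint.h>

	struct itimer_state {
		uint64_t expires;	/* next expiry, in ticks */
		uint64_t incr;		/* reload interval, in ticks (rounded up) */
		uint32_t error;		/* accumulated rounding error, in ns */
		uint32_t incr_error;	/* rounding error added per reload, in ns */
	};

	static void reload(struct itimer_state *it, uint32_t ns_per_tick)
	{
		it->expires += it->incr;
		it->error += it->incr_error;
		if (it->error >= ns_per_tick) {
			it->expires -= 1;		/* give one tick back */
			it->error -= ns_per_tick;
		}
	}

Over many reloads the interval therefore averages out to the requested value instead of drifting by up to one tick per period.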
@@ -1027,30 +933,11 @@ static void check_cpu_itimer(struct task_struct *tsk, struct cpu_itimer *it,
 		__group_send_sig_info(signo, SEND_SIG_PRIV, tsk);
 	}
 
-	if (!cputime_eq(it->expires, cputime_zero) &&
-	    (cputime_eq(*expires, cputime_zero) ||
-	     cputime_lt(it->expires, *expires))) {
+	if (it->expires && (!*expires || it->expires < *expires)) {
 		*expires = it->expires;
 	}
 }
 
-/**
- * task_cputime_zero - Check a task_cputime struct for all zero fields.
- *
- * @cputime:	The struct to compare.
- *
- * Checks @cputime to see if all fields are zero.  Returns true if all fields
- * are zero, false if any field is nonzero.
- */
-static inline int task_cputime_zero(const struct task_cputime *cputime)
-{
-	if (cputime_eq(cputime->utime, cputime_zero) &&
-	    cputime_eq(cputime->stime, cputime_zero) &&
-	    cputime->sum_exec_runtime == 0)
-		return 1;
-	return 0;
-}
-
 /*
  * Check for any per-thread CPU timers that have fired and move them
  * off the tsk->*_timers list onto the firing list.  Per-thread timers
@@ -1059,9 +946,8 @@ static inline int task_cputime_zero(const struct task_cputime *cputime)
 static void check_process_timers(struct task_struct *tsk,
 				 struct list_head *firing)
 {
-	int maxfire;
 	struct signal_struct *const sig = tsk->signal;
-	cputime_t utime, ptime, virt_expires, prof_expires;
+	unsigned long long utime, ptime, virt_expires, prof_expires;
 	unsigned long long sum_sched_runtime, sched_expires;
 	struct list_head *timers = sig->cpu_timers;
 	struct task_cputime cputime;
@@ -1071,52 +957,13 @@ static void check_process_timers(struct task_struct *tsk,
 	 * Collect the current process totals.
 	 */
 	thread_group_cputimer(tsk, &cputime);
-	utime = cputime.utime;
-	ptime = cputime_add(utime, cputime.stime);
+	utime = cputime_to_expires(cputime.utime);
+	ptime = utime + cputime_to_expires(cputime.stime);
 	sum_sched_runtime = cputime.sum_exec_runtime;
 
-	maxfire = 20;
-	prof_expires = cputime_zero;
-	while (!list_empty(timers)) {
-		struct cpu_timer_list *tl = list_first_entry(timers,
-						      struct cpu_timer_list,
-						      entry);
-		if (!--maxfire || cputime_lt(ptime, tl->expires.cpu)) {
-			prof_expires = tl->expires.cpu;
-			break;
-		}
-		tl->firing = 1;
-		list_move_tail(&tl->entry, firing);
-	}
-
-	++timers;
-	maxfire = 20;
-	virt_expires = cputime_zero;
-	while (!list_empty(timers)) {
-		struct cpu_timer_list *tl = list_first_entry(timers,
-						      struct cpu_timer_list,
-						      entry);
-		if (!--maxfire || cputime_lt(utime, tl->expires.cpu)) {
-			virt_expires = tl->expires.cpu;
-			break;
-		}
-		tl->firing = 1;
-		list_move_tail(&tl->entry, firing);
-	}
-	++timers;
-	maxfire = 20;
-	sched_expires = 0;
-	while (!list_empty(timers)) {
-		struct cpu_timer_list *tl = list_first_entry(timers,
-						      struct cpu_timer_list,
-						      entry);
-		if (!--maxfire || sum_sched_runtime < tl->expires.sched) {
-			sched_expires = tl->expires.sched;
-			break;
-		}
-		tl->firing = 1;
-		list_move_tail(&tl->entry, firing);
-	}
+	prof_expires = check_timers_list(timers, firing, ptime);
+	virt_expires = check_timers_list(++timers, firing, utime);
+	sched_expires = check_timers_list(++timers, firing, sum_sched_runtime);
 
 	/*
 	 * Check for the special case process timers.
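The three process-wide clocks sampled in check_process_timers() above all derive from one set of group totals. A plain-C restatement of that mapping (struct and names are mock-ups, not the kernel's task_cputime):

	#include <stdint.h>

	struct totals {
		uint64_t utime;			/* user time */
		uint64_t stime;			/* system time */
		uint64_t sum_exec_runtime;	/* scheduler runtime, ns */
	};

	static uint64_t virt_clock(const struct totals *t)  { return t->utime; }
	static uint64_t prof_clock(const struct totals *t)  { return t->utime + t->stime; }
	static uint64_t sched_clock_ns(const struct totals *t) { return t->sum_exec_runtime; }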
@@ -1150,14 +997,13 @@ static void check_process_timers(struct task_struct *tsk,
 			}
 		}
 		x = secs_to_cputime(soft);
-		if (cputime_eq(prof_expires, cputime_zero) ||
-		    cputime_lt(x, prof_expires)) {
+		if (!prof_expires || x < prof_expires) {
 			prof_expires = x;
 		}
 	}
 
-	sig->cputime_expires.prof_exp = prof_expires;
-	sig->cputime_expires.virt_exp = virt_expires;
+	sig->cputime_expires.prof_exp = expires_to_cputime(prof_expires);
+	sig->cputime_expires.virt_exp = expires_to_cputime(virt_expires);
 	sig->cputime_expires.sched_exp = sched_expires;
 	if (task_cputime_zero(&sig->cputime_expires))
 		stop_process_timers(sig);
@@ -1169,14 +1015,12 @@ static void check_process_timers(struct task_struct *tsk,
  */
 void posix_cpu_timer_schedule(struct k_itimer *timer)
 {
+	struct sighand_struct *sighand;
+	unsigned long flags;
 	struct task_struct *p = timer->it.cpu.task;
-	union cpu_time_count now;
+	unsigned long long now;
 
-	if (unlikely(p == NULL))
-		/*
-		 * The task was cleaned up already, no future firings.
-		 */
-		goto out;
+	WARN_ON_ONCE(p == NULL);
 
 	/*
 	 * Fetch the current sample and update the timer's expiry time.
@@ -1184,48 +1028,45 @@ void posix_cpu_timer_schedule(struct k_itimer *timer)
 	if (CPUCLOCK_PERTHREAD(timer->it_clock)) {
 		cpu_clock_sample(timer->it_clock, p, &now);
 		bump_cpu_timer(timer, now);
-		if (unlikely(p->exit_state)) {
-			clear_dead_task(timer, now);
+		if (unlikely(p->exit_state))
+			goto out;
+
+		/* Protect timer list r/w in arm_timer() */
+		sighand = lock_task_sighand(p, &flags);
+		if (!sighand)
 			goto out;
-		}
-		read_lock(&tasklist_lock); /* arm_timer needs it.  */
-		spin_lock(&p->sighand->siglock);
 	} else {
-		read_lock(&tasklist_lock);
-		if (unlikely(p->sighand == NULL)) {
+		/*
+		 * Protect arm_timer() and timer sampling in case of call to
+		 * thread_group_cputime().
+		 */
+		sighand = lock_task_sighand(p, &flags);
+		if (unlikely(sighand == NULL)) {
 			/*
 			 * The process has been reaped.
 			 * We can't even collect a sample any more.
 			 */
-			put_task_struct(p);
-			timer->it.cpu.task = p = NULL;
-			timer->it.cpu.expires.sched = 0;
-			goto out_unlock;
+			timer->it.cpu.expires = 0;
+			goto out;
 		} else if (unlikely(p->exit_state) && thread_group_empty(p)) {
-			/*
-			 * We've noticed that the thread is dead, but
-			 * not yet reaped.  Take this opportunity to
-			 * drop our task ref.
-			 */
-			clear_dead_task(timer, now);
-			goto out_unlock;
+			unlock_task_sighand(p, &flags);
+			/* Optimizations: if the process is dying, no need to rearm */
+			goto out;
 		}
-		spin_lock(&p->sighand->siglock);
 		cpu_timer_sample_group(timer->it_clock, p, &now);
 		bump_cpu_timer(timer, now);
-		/* Leave the tasklist_lock locked for the call below.  */
+		/* Leave the sighand locked for the call below.  */
 	}
 
 	/*
 	 * Now re-arm for the new expiry time.
 	 */
-	BUG_ON(!irqs_disabled());
+	WARN_ON_ONCE(!irqs_disabled());
 	arm_timer(timer);
-	spin_unlock(&p->sighand->siglock);
-
-out_unlock:
-	read_unlock(&tasklist_lock);
+	unlock_task_sighand(p, &flags);
+	/* Kick full dynticks CPUs in case they need to tick on the new timer */
+	posix_cpu_timer_kick_nohz();
 out:
 	timer->it_overrun_last = timer->it_overrun;
 	timer->it_overrun = -1;
@@ -1245,12 +1086,9 @@ out:
 static inline int task_cputime_expired(const struct task_cputime *sample,
 					const struct task_cputime *expires)
 {
-	if (!cputime_eq(expires->utime, cputime_zero) &&
-	    cputime_ge(sample->utime, expires->utime))
+	if (expires->utime && sample->utime >= expires->utime)
 		return 1;
-	if (!cputime_eq(expires->stime, cputime_zero) &&
-	    cputime_ge(cputime_add(sample->utime, sample->stime),
-		       expires->stime))
+	if (expires->stime && sample->utime + sample->stime >= expires->stime)
 		return 1;
 	if (expires->sum_exec_runtime != 0 &&
 	    sample->sum_exec_runtime >= expires->sum_exec_runtime)
@@ -1271,11 +1109,14 @@ static inline int task_cputime_expired(const struct task_cputime *sample,
 static inline int fastpath_timer_check(struct task_struct *tsk)
 {
 	struct signal_struct *sig;
+	cputime_t utime, stime;
+
+	task_cputime(tsk, &utime, &stime);
 
 	if (!task_cputime_zero(&tsk->cputime_expires)) {
 		struct task_cputime task_sample = {
-			.utime = tsk->utime,
-			.stime = tsk->stime,
+			.utime = utime,
+			.stime = stime,
 			.sum_exec_runtime = tsk->se.sum_exec_runtime
 		};
 
@@ -1287,9 +1128,9 @@ static inline int fastpath_timer_check(struct task_struct *tsk)
 	if (sig->cputimer.running) {
 		struct task_cputime group_sample;
 
-		spin_lock(&sig->cputimer.lock);
+		raw_spin_lock(&sig->cputimer.lock);
 		group_sample = sig->cputimer.cputime;
-		spin_unlock(&sig->cputimer.lock);
+		raw_spin_unlock(&sig->cputimer.lock);
 
 		if (task_cputime_expired(&group_sample, &sig->cputime_expires))
 			return 1;
@@ -1309,7 +1150,7 @@ void run_posix_cpu_timers(struct task_struct *tsk)
 	struct k_itimer *timer, *next;
 	unsigned long flags;
 
-	BUG_ON(!irqs_disabled());
+	WARN_ON_ONCE(!irqs_disabled());
 
 	/*
 	 * The fast path checks that there are no expired thread or thread
@@ -1345,7 +1186,7 @@ void run_posix_cpu_timers(struct task_struct *tsk)
 
 	/*
 	 * Now that all the timers on our list have the firing flag,
-	 * noone will touch their list entries but us.  We'll take
+	 * no one will touch their list entries but us.  We'll take
 	 * each timer's lock before clearing its firing flag, so no
 	 * timer call will interfere.
	 */
@@ -1374,9 +1215,9 @@ void run_posix_cpu_timers(struct task_struct *tsk)
 void set_process_cpu_timer(struct task_struct *tsk, unsigned int clock_idx,
 			   cputime_t *newval, cputime_t *oldval)
 {
-	union cpu_time_count now;
+	unsigned long long now;
 
-	BUG_ON(clock_idx == CPUCLOCK_SCHED);
+	WARN_ON_ONCE(clock_idx == CPUCLOCK_SCHED);
 	cpu_timer_sample_group(clock_idx, tsk, &now);
 
 	if (oldval) {
@@ -1385,18 +1226,18 @@ void set_process_cpu_timer(struct task_struct *tsk, unsigned int clock_idx,
 		 * it to be relative, *newval argument is relative and we update
 		 * it to be absolute.
 		 */
-		if (!cputime_eq(*oldval, cputime_zero)) {
-			if (cputime_le(*oldval, now.cpu)) {
+		if (*oldval) {
+			if (*oldval <= now) {
 				/* Just about to fire. */
 				*oldval = cputime_one_jiffy;
 			} else {
-				*oldval = cputime_sub(*oldval, now.cpu);
+				*oldval -= now;
 			}
 		}
 
-		if (cputime_eq(*newval, cputime_zero))
-			return;
-		*newval = cputime_add(*newval, now.cpu);
+		if (!*newval)
+			goto out;
+		*newval += now;
 	}
 
 	/*
@@ -1413,6 +1254,8 @@ void set_process_cpu_timer(struct task_struct *tsk, unsigned int clock_idx,
 			tsk->signal->cputime_expires.virt_exp = *newval;
 		break;
 	}
+out:
+	posix_cpu_timer_kick_nohz();
 }
 
 static int do_cpu_nanosleep(const clockid_t which_clock, int flags,
@@ -1444,10 +1287,12 @@ static int do_cpu_nanosleep(const clockid_t which_clock, int flags,
 		}
 
 		while (!signal_pending(current)) {
-			if (timer.it.cpu.expires.sched == 0) {
+			if (timer.it.cpu.expires == 0) {
 				/*
-				 * Our timer fired and was reset.
+				 * Our timer fired and was reset, below
+				 * deletion can not fail.
 				 */
+				posix_cpu_timer_del(&timer);
 				spin_unlock_irq(&timer.it_lock);
 				return 0;
 			}
@@ -1465,9 +1310,26 @@ static int do_cpu_nanosleep(const clockid_t which_clock, int flags,
 		 * We were interrupted by a signal.
 		 */
 		sample_to_timespec(which_clock, timer.it.cpu.expires, rqtp);
-		posix_cpu_timer_set(&timer, 0, &zero_it, it);
+		error = posix_cpu_timer_set(&timer, 0, &zero_it, it);
+		if (!error) {
+			/*
+			 * Timer is now unarmed, deletion can not fail.
+			 */
+			posix_cpu_timer_del(&timer);
+		}
 		spin_unlock_irq(&timer.it_lock);
 
+		while (error == TIMER_RETRY) {
+			/*
+			 * We need to handle case when timer was or is in the
+			 * middle of firing. In other cases we already freed
+			 * resources.
+			 */
+			spin_lock_irq(&timer.it_lock);
+			error = posix_cpu_timer_del(&timer);
+			spin_unlock_irq(&timer.it_lock);
+		}
+
 		if ((it->it_value.tv_sec | it->it_value.tv_nsec) == 0) {
 			/*
 			 * It actually did fire already.
@@ -1481,11 +1343,13 @@ static int do_cpu_nanosleep(const clockid_t which_clock, int flags,
 	return error;
 }
 
-int posix_cpu_nsleep(const clockid_t which_clock, int flags,
-		     struct timespec *rqtp, struct timespec __user *rmtp)
+static long posix_cpu_nsleep_restart(struct restart_block *restart_block);
+
+static int posix_cpu_nsleep(const clockid_t which_clock, int flags,
+			    struct timespec *rqtp, struct timespec __user *rmtp)
 {
 	struct restart_block *restart_block =
-	    &current_thread_info()->restart_block;
+		&current_thread_info()->restart_block;
 	struct itimerspec it;
 	int error;
 
@@ -1501,56 +1365,47 @@ int posix_cpu_nsleep(const clockid_t which_clock, int flags,
 
 	if (error == -ERESTART_RESTARTBLOCK) {
 
-	       	if (flags & TIMER_ABSTIME)
+		if (flags & TIMER_ABSTIME)
 			return -ERESTARTNOHAND;
 		/*
-	 	 * Report back to the user the time still remaining.
-	 	 */
-		if (rmtp != NULL && copy_to_user(rmtp, &it.it_value, sizeof *rmtp))
+		 * Report back to the user the time still remaining.
+		 */
+		if (rmtp && copy_to_user(rmtp, &it.it_value, sizeof *rmtp))
 			return -EFAULT;
 
 		restart_block->fn = posix_cpu_nsleep_restart;
-		restart_block->arg0 = which_clock;
-		restart_block->arg1 = (unsigned long) rmtp;
-		restart_block->arg2 = rqtp->tv_sec;
-		restart_block->arg3 = rqtp->tv_nsec;
+		restart_block->nanosleep.clockid = which_clock;
+		restart_block->nanosleep.rmtp = rmtp;
+		restart_block->nanosleep.expires = timespec_to_ns(rqtp);
 	}
 	return error;
 }
 
-long posix_cpu_nsleep_restart(struct restart_block *restart_block)
+static long posix_cpu_nsleep_restart(struct restart_block *restart_block)
 {
-	clockid_t which_clock = restart_block->arg0;
-	struct timespec __user *rmtp;
+	clockid_t which_clock = restart_block->nanosleep.clockid;
 	struct timespec t;
 	struct itimerspec it;
 	int error;
 
-	rmtp = (struct timespec __user *) restart_block->arg1;
-	t.tv_sec = restart_block->arg2;
-	t.tv_nsec = restart_block->arg3;
+	t = ns_to_timespec(restart_block->nanosleep.expires);
 
-	restart_block->fn = do_no_restart_syscall;
 	error = do_cpu_nanosleep(which_clock, TIMER_ABSTIME, &t, &it);
 
 	if (error == -ERESTART_RESTARTBLOCK) {
+		struct timespec __user *rmtp = restart_block->nanosleep.rmtp;
 		/*
-	 	 * Report back to the user the time still remaining.
-	 	 */
-		if (rmtp != NULL && copy_to_user(rmtp, &it.it_value, sizeof *rmtp))
+		 * Report back to the user the time still remaining.
+		 */
+		if (rmtp && copy_to_user(rmtp, &it.it_value, sizeof *rmtp))
 			return -EFAULT;
 
-		restart_block->fn = posix_cpu_nsleep_restart;
-		restart_block->arg0 = which_clock;
-		restart_block->arg1 = (unsigned long) rmtp;
-		restart_block->arg2 = t.tv_sec;
-		restart_block->arg3 = t.tv_nsec;
+		restart_block->nanosleep.expires = timespec_to_ns(&t);
 	}
 	return error;
 }
-
 #define PROCESS_CLOCK	MAKE_PROCESS_CPUCLOCK(0, CPUCLOCK_SCHED)
 #define THREAD_CLOCK	MAKE_THREAD_CPUCLOCK(0, CPUCLOCK_SCHED)
 
@@ -1594,38 +1449,37 @@ static int thread_cpu_timer_create(struct k_itimer *timer)
 	timer->it_clock = THREAD_CLOCK;
 	return posix_cpu_timer_create(timer);
 }
-static int thread_cpu_nsleep(const clockid_t which_clock, int flags,
-			      struct timespec *rqtp, struct timespec __user *rmtp)
-{
-	return -EINVAL;
-}
-static long thread_cpu_nsleep_restart(struct restart_block *restart_block)
-{
-	return -EINVAL;
-}
+
+struct k_clock clock_posix_cpu = {
+	.clock_getres	= posix_cpu_clock_getres,
+	.clock_set	= posix_cpu_clock_set,
+	.clock_get	= posix_cpu_clock_get,
+	.timer_create	= posix_cpu_timer_create,
+	.nsleep		= posix_cpu_nsleep,
+	.nsleep_restart	= posix_cpu_nsleep_restart,
+	.timer_set	= posix_cpu_timer_set,
+	.timer_del	= posix_cpu_timer_del,
+	.timer_get	= posix_cpu_timer_get,
+};
 
 static __init int init_posix_cpu_timers(void)
 {
 	struct k_clock process = {
-		.clock_getres = process_cpu_clock_getres,
-		.clock_get = process_cpu_clock_get,
-		.clock_set = do_posix_clock_nosettime,
-		.timer_create = process_cpu_timer_create,
-		.nsleep = process_cpu_nsleep,
-		.nsleep_restart = process_cpu_nsleep_restart,
+		.clock_getres	= process_cpu_clock_getres,
+		.clock_get	= process_cpu_clock_get,
+		.timer_create	= process_cpu_timer_create,
+		.nsleep		= process_cpu_nsleep,
+		.nsleep_restart	= process_cpu_nsleep_restart,
 	};
 	struct k_clock thread = {
-		.clock_getres = thread_cpu_clock_getres,
-		.clock_get = thread_cpu_clock_get,
-		.clock_set = do_posix_clock_nosettime,
-		.timer_create = thread_cpu_timer_create,
-		.nsleep = thread_cpu_nsleep,
-		.nsleep_restart = thread_cpu_nsleep_restart,
+		.clock_getres	= thread_cpu_clock_getres,
+		.clock_get	= thread_cpu_clock_get,
+		.timer_create	= thread_cpu_timer_create,
 	};
 	struct timespec ts;
 
-	register_posix_clock(CLOCK_PROCESS_CPUTIME_ID, &process);
-	register_posix_clock(CLOCK_THREAD_CPUTIME_ID, &thread);
+	posix_timers_register_clock(CLOCK_PROCESS_CPUTIME_ID, &process);
+	posix_timers_register_clock(CLOCK_THREAD_CPUTIME_ID, &thread);
 
 	cputime_to_timespec(cputime_one_jiffy, &ts);
 	onecputick = ts.tv_nsec;
