Diffstat (limited to 'kernel')
-rw-r--r--  kernel/Kconfig.locks          |   2
-rw-r--r--  kernel/auditsc.c              |  27
-rw-r--r--  kernel/capability.c           |   4
-rw-r--r--  kernel/compat.c               |   8
-rw-r--r--  kernel/cred.c                 |   6
-rw-r--r--  kernel/fork.c                 |  42
-rw-r--r--  kernel/hrtimer.c              | 162
-rw-r--r--  kernel/irq/proc.c             |  54
-rw-r--r--  kernel/kmod.c                 | 100
-rw-r--r--  kernel/module.c               |   4
-rw-r--r--  kernel/mutex.c                |  25
-rw-r--r--  kernel/nsproxy.c              |  42
-rw-r--r--  kernel/pm_qos_params.c        |   2
-rw-r--r--  kernel/posix-cpu-timers.c     |   4
-rw-r--r--  kernel/posix-timers.c         |  27
-rw-r--r--  kernel/printk.c               |  87
-rw-r--r--  kernel/profile.c              |   6
-rw-r--r--  kernel/ptrace.c               |   2
-rw-r--r--  kernel/sched.c                |  41
-rw-r--r--  kernel/sched_fair.c           |  52
-rw-r--r--  kernel/signal.c               |   6
-rw-r--r--  kernel/sysctl.c               |  20
-rw-r--r--  kernel/time/alarmtimer.c      |  16
-rw-r--r--  kernel/time/tick-broadcast.c  |  16
-rw-r--r--  kernel/time/timekeeping.c     |  17
-rw-r--r--  kernel/utsname.c              |  39
-rw-r--r--  kernel/watchdog.c             |  52
-rw-r--r--  kernel/workqueue.c            |   4
28 files changed, 621 insertions, 246 deletions
| diff --git a/kernel/Kconfig.locks b/kernel/Kconfig.locks index 88c92fb4461..5068e2a4e75 100644 --- a/kernel/Kconfig.locks +++ b/kernel/Kconfig.locks @@ -199,4 +199,4 @@ config INLINE_WRITE_UNLOCK_IRQRESTORE  	def_bool !DEBUG_SPINLOCK && ARCH_INLINE_WRITE_UNLOCK_IRQRESTORE  config MUTEX_SPIN_ON_OWNER -	def_bool SMP && !DEBUG_MUTEXES && !HAVE_DEFAULT_NO_SPIN_MUTEXES +	def_bool SMP && !DEBUG_MUTEXES diff --git a/kernel/auditsc.c b/kernel/auditsc.c index b33513a08be..00d79df03e7 100644 --- a/kernel/auditsc.c +++ b/kernel/auditsc.c @@ -443,17 +443,25 @@ static int match_tree_refs(struct audit_context *ctx, struct audit_tree *tree)  /* Determine if any context name data matches a rule's watch data */  /* Compare a task_struct with an audit_rule.  Return 1 on match, 0 - * otherwise. */ + * otherwise. + * + * If task_creation is true, this is an explicit indication that we are + * filtering a task rule at task creation time.  This and tsk == current are + * the only situations where tsk->cred may be accessed without an rcu read lock. + */  static int audit_filter_rules(struct task_struct *tsk,  			      struct audit_krule *rule,  			      struct audit_context *ctx,  			      struct audit_names *name, -			      enum audit_state *state) +			      enum audit_state *state, +			      bool task_creation)  { -	const struct cred *cred = get_task_cred(tsk); +	const struct cred *cred;  	int i, j, need_sid = 1;  	u32 sid; +	cred = rcu_dereference_check(tsk->cred, tsk == current || task_creation); +  	for (i = 0; i < rule->field_count; i++) {  		struct audit_field *f = &rule->fields[i];  		int result = 0; @@ -637,10 +645,8 @@ static int audit_filter_rules(struct task_struct *tsk,  			break;  		} -		if (!result) { -			put_cred(cred); +		if (!result)  			return 0; -		}  	}  	if (ctx) { @@ -656,7 +662,6 @@ static int audit_filter_rules(struct task_struct *tsk,  	case AUDIT_NEVER:    *state = AUDIT_DISABLED;	    break;  	case AUDIT_ALWAYS:   *state = AUDIT_RECORD_CONTEXT; break;  	} -	put_cred(cred);  	return 1;  } @@ -671,7 +676,8 @@ static enum audit_state audit_filter_task(struct task_struct *tsk, char **key)  	rcu_read_lock();  	list_for_each_entry_rcu(e, &audit_filter_list[AUDIT_FILTER_TASK], list) { -		if (audit_filter_rules(tsk, &e->rule, NULL, NULL, &state)) { +		if (audit_filter_rules(tsk, &e->rule, NULL, NULL, +				       &state, true)) {  			if (state == AUDIT_RECORD_CONTEXT)  				*key = kstrdup(e->rule.filterkey, GFP_ATOMIC);  			rcu_read_unlock(); @@ -705,7 +711,7 @@ static enum audit_state audit_filter_syscall(struct task_struct *tsk,  		list_for_each_entry_rcu(e, list, list) {  			if ((e->rule.mask[word] & bit) == bit &&  			    audit_filter_rules(tsk, &e->rule, ctx, NULL, -					       &state)) { +					       &state, false)) {  				rcu_read_unlock();  				ctx->current_state = state;  				return state; @@ -743,7 +749,8 @@ void audit_filter_inodes(struct task_struct *tsk, struct audit_context *ctx)  		list_for_each_entry_rcu(e, list, list) {  			if ((e->rule.mask[word] & bit) == bit && -			    audit_filter_rules(tsk, &e->rule, ctx, n, &state)) { +			    audit_filter_rules(tsk, &e->rule, ctx, n, +				    	       &state, false)) {  				rcu_read_unlock();  				ctx->current_state = state;  				return; diff --git a/kernel/capability.c b/kernel/capability.c index 32a80e08ff4..283c529f8b1 100644 --- a/kernel/capability.c +++ b/kernel/capability.c @@ -22,12 +22,8 @@   */  const kernel_cap_t __cap_empty_set = CAP_EMPTY_SET; -const kernel_cap_t __cap_full_set = CAP_FULL_SET; -const kernel_cap_t 
__cap_init_eff_set = CAP_INIT_EFF_SET;  EXPORT_SYMBOL(__cap_empty_set); -EXPORT_SYMBOL(__cap_full_set); -EXPORT_SYMBOL(__cap_init_eff_set);  int file_caps_enabled = 1; diff --git a/kernel/compat.c b/kernel/compat.c index 9214dcd087b..fc9eb093acd 100644 --- a/kernel/compat.c +++ b/kernel/compat.c @@ -293,6 +293,8 @@ asmlinkage long compat_sys_times(struct compat_tms __user *tbuf)  	return compat_jiffies_to_clock_t(jiffies);  } +#ifdef __ARCH_WANT_SYS_SIGPENDING +  /*   * Assumption: old_sigset_t and compat_old_sigset_t are both   * types that can be passed to put_user()/get_user(). @@ -312,6 +314,10 @@ asmlinkage long compat_sys_sigpending(compat_old_sigset_t __user *set)  	return ret;  } +#endif + +#ifdef __ARCH_WANT_SYS_SIGPROCMASK +  asmlinkage long compat_sys_sigprocmask(int how, compat_old_sigset_t __user *set,  		compat_old_sigset_t __user *oset)  { @@ -333,6 +339,8 @@ asmlinkage long compat_sys_sigprocmask(int how, compat_old_sigset_t __user *set,  	return ret;  } +#endif +  asmlinkage long compat_sys_setrlimit(unsigned int resource,  		struct compat_rlimit __user *rlim)  { diff --git a/kernel/cred.c b/kernel/cred.c index 8093c16b84b..e12c8af793f 100644 --- a/kernel/cred.c +++ b/kernel/cred.c @@ -49,10 +49,10 @@ struct cred init_cred = {  	.magic			= CRED_MAGIC,  #endif  	.securebits		= SECUREBITS_DEFAULT, -	.cap_inheritable	= CAP_INIT_INH_SET, +	.cap_inheritable	= CAP_EMPTY_SET,  	.cap_permitted		= CAP_FULL_SET, -	.cap_effective		= CAP_INIT_EFF_SET, -	.cap_bset		= CAP_INIT_BSET, +	.cap_effective		= CAP_FULL_SET, +	.cap_bset		= CAP_FULL_SET,  	.user			= INIT_USER,  	.user_ns		= &init_user_ns,  	.group_info		= &init_groups, diff --git a/kernel/fork.c b/kernel/fork.c index 2b44d82b823..8e7e135d081 100644 --- a/kernel/fork.c +++ b/kernel/fork.c @@ -383,15 +383,14 @@ static int dup_mmap(struct mm_struct *mm, struct mm_struct *oldmm)  			get_file(file);  			if (tmp->vm_flags & VM_DENYWRITE)  				atomic_dec(&inode->i_writecount); -			spin_lock(&mapping->i_mmap_lock); +			mutex_lock(&mapping->i_mmap_mutex);  			if (tmp->vm_flags & VM_SHARED)  				mapping->i_mmap_writable++; -			tmp->vm_truncate_count = mpnt->vm_truncate_count;  			flush_dcache_mmap_lock(mapping);  			/* insert tmp into the share list, just after mpnt */  			vma_prio_tree_add(tmp, mpnt);  			flush_dcache_mmap_unlock(mapping); -			spin_unlock(&mapping->i_mmap_lock); +			mutex_unlock(&mapping->i_mmap_mutex);  		}  		/* @@ -486,6 +485,20 @@ static void mm_init_aio(struct mm_struct *mm)  #endif  } +int mm_init_cpumask(struct mm_struct *mm, struct mm_struct *oldmm) +{ +#ifdef CONFIG_CPUMASK_OFFSTACK +	if (!alloc_cpumask_var(&mm->cpu_vm_mask_var, GFP_KERNEL)) +		return -ENOMEM; + +	if (oldmm) +		cpumask_copy(mm_cpumask(mm), mm_cpumask(oldmm)); +	else +		memset(mm_cpumask(mm), 0, cpumask_size()); +#endif +	return 0; +} +  static struct mm_struct * mm_init(struct mm_struct * mm, struct task_struct *p)  {  	atomic_set(&mm->mm_users, 1); @@ -522,10 +535,20 @@ struct mm_struct * mm_alloc(void)  	struct mm_struct * mm;  	mm = allocate_mm(); -	if (mm) { -		memset(mm, 0, sizeof(*mm)); -		mm = mm_init(mm, current); +	if (!mm) +		return NULL; + +	memset(mm, 0, sizeof(*mm)); +	mm = mm_init(mm, current); +	if (!mm) +		return NULL; + +	if (mm_init_cpumask(mm, NULL)) { +		mm_free_pgd(mm); +		free_mm(mm); +		return NULL;  	} +  	return mm;  } @@ -537,6 +560,7 @@ struct mm_struct * mm_alloc(void)  void __mmdrop(struct mm_struct *mm)  {  	BUG_ON(mm == &init_mm); +	free_cpumask_var(mm->cpu_vm_mask_var);  	mm_free_pgd(mm);  	destroy_context(mm);  	
mmu_notifier_mm_destroy(mm); @@ -691,6 +715,9 @@ struct mm_struct *dup_mm(struct task_struct *tsk)  	if (!mm_init(mm, tsk))  		goto fail_nomem; +	if (mm_init_cpumask(mm, oldmm)) +		goto fail_nocpumask; +  	if (init_new_context(tsk, mm))  		goto fail_nocontext; @@ -717,6 +744,9 @@ fail_nomem:  	return NULL;  fail_nocontext: +	free_cpumask_var(mm->cpu_vm_mask_var); + +fail_nocpumask:  	/*  	 * If init_new_context() failed, we cannot use mmput() to free the mm  	 * because it calls destroy_context() diff --git a/kernel/hrtimer.c b/kernel/hrtimer.c index dbbbf7d4308..a9205e32a05 100644 --- a/kernel/hrtimer.c +++ b/kernel/hrtimer.c @@ -64,17 +64,20 @@ DEFINE_PER_CPU(struct hrtimer_cpu_base, hrtimer_bases) =  	.clock_base =  	{  		{ -			.index = CLOCK_REALTIME, -			.get_time = &ktime_get_real, +			.index = HRTIMER_BASE_MONOTONIC, +			.clockid = CLOCK_MONOTONIC, +			.get_time = &ktime_get,  			.resolution = KTIME_LOW_RES,  		},  		{ -			.index = CLOCK_MONOTONIC, -			.get_time = &ktime_get, +			.index = HRTIMER_BASE_REALTIME, +			.clockid = CLOCK_REALTIME, +			.get_time = &ktime_get_real,  			.resolution = KTIME_LOW_RES,  		},  		{ -			.index = CLOCK_BOOTTIME, +			.index = HRTIMER_BASE_BOOTTIME, +			.clockid = CLOCK_BOOTTIME,  			.get_time = &ktime_get_boottime,  			.resolution = KTIME_LOW_RES,  		}, @@ -196,7 +199,7 @@ switch_hrtimer_base(struct hrtimer *timer, struct hrtimer_clock_base *base,  	struct hrtimer_cpu_base *new_cpu_base;  	int this_cpu = smp_processor_id();  	int cpu = hrtimer_get_target(this_cpu, pinned); -	int basenum = hrtimer_clockid_to_base(base->index); +	int basenum = base->index;  again:  	new_cpu_base = &per_cpu(hrtimer_bases, cpu); @@ -621,66 +624,6 @@ static int hrtimer_reprogram(struct hrtimer *timer,  	return res;  } - -/* - * Retrigger next event is called after clock was set - * - * Called with interrupts disabled via on_each_cpu() - */ -static void retrigger_next_event(void *arg) -{ -	struct hrtimer_cpu_base *base; -	struct timespec realtime_offset, wtm, sleep; - -	if (!hrtimer_hres_active()) -		return; - -	get_xtime_and_monotonic_and_sleep_offset(&realtime_offset, &wtm, -							&sleep); -	set_normalized_timespec(&realtime_offset, -wtm.tv_sec, -wtm.tv_nsec); - -	base = &__get_cpu_var(hrtimer_bases); - -	/* Adjust CLOCK_REALTIME offset */ -	raw_spin_lock(&base->lock); -	base->clock_base[HRTIMER_BASE_REALTIME].offset = -		timespec_to_ktime(realtime_offset); -	base->clock_base[HRTIMER_BASE_BOOTTIME].offset = -		timespec_to_ktime(sleep); - -	hrtimer_force_reprogram(base, 0); -	raw_spin_unlock(&base->lock); -} - -/* - * Clock realtime was set - * - * Change the offset of the realtime clock vs. the monotonic - * clock. - * - * We might have to reprogram the high resolution timer interrupt. On - * SMP we call the architecture specific code to retrigger _all_ high - * resolution timer interrupts. On UP we just disable interrupts and - * call the high resolution interrupt code. 
- */ -void clock_was_set(void) -{ -	/* Retrigger the CPU local events everywhere */ -	on_each_cpu(retrigger_next_event, NULL, 1); -} - -/* - * During resume we might have to reprogram the high resolution timer - * interrupt (on the local CPU): - */ -void hres_timers_resume(void) -{ -	WARN_ONCE(!irqs_disabled(), -		  KERN_INFO "hres_timers_resume() called with IRQs enabled!"); - -	retrigger_next_event(NULL); -} -  /*   * Initialize the high resolution related parts of cpu_base   */ @@ -715,11 +658,39 @@ static inline int hrtimer_enqueue_reprogram(struct hrtimer *timer,  }  /* + * Retrigger next event is called after clock was set + * + * Called with interrupts disabled via on_each_cpu() + */ +static void retrigger_next_event(void *arg) +{ +	struct hrtimer_cpu_base *base = &__get_cpu_var(hrtimer_bases); +	struct timespec realtime_offset, xtim, wtm, sleep; + +	if (!hrtimer_hres_active()) +		return; + +	/* Optimized out for !HIGH_RES */ +	get_xtime_and_monotonic_and_sleep_offset(&xtim, &wtm, &sleep); +	set_normalized_timespec(&realtime_offset, -wtm.tv_sec, -wtm.tv_nsec); + +	/* Adjust CLOCK_REALTIME offset */ +	raw_spin_lock(&base->lock); +	base->clock_base[HRTIMER_BASE_REALTIME].offset = +		timespec_to_ktime(realtime_offset); +	base->clock_base[HRTIMER_BASE_BOOTTIME].offset = +		timespec_to_ktime(sleep); + +	hrtimer_force_reprogram(base, 0); +	raw_spin_unlock(&base->lock); +} + +/*   * Switch to high resolution mode   */  static int hrtimer_switch_to_hres(void)  { -	int cpu = smp_processor_id(); +	int i, cpu = smp_processor_id();  	struct hrtimer_cpu_base *base = &per_cpu(hrtimer_bases, cpu);  	unsigned long flags; @@ -735,9 +706,8 @@ static int hrtimer_switch_to_hres(void)  		return 0;  	}  	base->hres_active = 1; -	base->clock_base[HRTIMER_BASE_REALTIME].resolution = KTIME_HIGH_RES; -	base->clock_base[HRTIMER_BASE_MONOTONIC].resolution = KTIME_HIGH_RES; -	base->clock_base[HRTIMER_BASE_BOOTTIME].resolution = KTIME_HIGH_RES; +	for (i = 0; i < HRTIMER_MAX_CLOCK_BASES; i++) +		base->clock_base[i].resolution = KTIME_HIGH_RES;  	tick_setup_sched_timer(); @@ -761,9 +731,43 @@ static inline int hrtimer_enqueue_reprogram(struct hrtimer *timer,  	return 0;  }  static inline void hrtimer_init_hres(struct hrtimer_cpu_base *base) { } +static inline void retrigger_next_event(void *arg) { }  #endif /* CONFIG_HIGH_RES_TIMERS */ +/* + * Clock realtime was set + * + * Change the offset of the realtime clock vs. the monotonic + * clock. + * + * We might have to reprogram the high resolution timer interrupt. On + * SMP we call the architecture specific code to retrigger _all_ high + * resolution timer interrupts. On UP we just disable interrupts and + * call the high resolution interrupt code. 
+ */ +void clock_was_set(void) +{ +#ifdef CONFIG_HIGH_RES_TIMERS +	/* Retrigger the CPU local events everywhere */ +	on_each_cpu(retrigger_next_event, NULL, 1); +#endif +	timerfd_clock_was_set(); +} + +/* + * During resume we might have to reprogram the high resolution timer + * interrupt (on the local CPU): + */ +void hrtimers_resume(void) +{ +	WARN_ONCE(!irqs_disabled(), +		  KERN_INFO "hrtimers_resume() called with IRQs enabled!"); + +	retrigger_next_event(NULL); +	timerfd_clock_was_set(); +} +  static inline void timer_stats_hrtimer_set_start_info(struct hrtimer *timer)  {  #ifdef CONFIG_TIMER_STATS @@ -856,6 +860,7 @@ static int enqueue_hrtimer(struct hrtimer *timer,  	debug_activate(timer);  	timerqueue_add(&base->active, &timer->node); +	base->cpu_base->active_bases |= 1 << base->index;  	/*  	 * HRTIMER_STATE_ENQUEUED is or'ed to the current state to preserve the @@ -897,6 +902,8 @@ static void __remove_hrtimer(struct hrtimer *timer,  #endif  	}  	timerqueue_del(&base->active, &timer->node); +	if (!timerqueue_getnext(&base->active)) +		base->cpu_base->active_bases &= ~(1 << base->index);  out:  	timer->state = newstate;  } @@ -1234,7 +1241,6 @@ static void __run_hrtimer(struct hrtimer *timer, ktime_t *now)  void hrtimer_interrupt(struct clock_event_device *dev)  {  	struct hrtimer_cpu_base *cpu_base = &__get_cpu_var(hrtimer_bases); -	struct hrtimer_clock_base *base;  	ktime_t expires_next, now, entry_time, delta;  	int i, retries = 0; @@ -1256,12 +1262,15 @@ retry:  	 */  	cpu_base->expires_next.tv64 = KTIME_MAX; -	base = cpu_base->clock_base; -  	for (i = 0; i < HRTIMER_MAX_CLOCK_BASES; i++) { -		ktime_t basenow; +		struct hrtimer_clock_base *base;  		struct timerqueue_node *node; +		ktime_t basenow; + +		if (!(cpu_base->active_bases & (1 << i))) +			continue; +		base = cpu_base->clock_base + i;  		basenow = ktime_add(now, base->offset);  		while ((node = timerqueue_getnext(&base->active))) { @@ -1294,7 +1303,6 @@ retry:  			__run_hrtimer(timer, &basenow);  		} -		base++;  	}  	/* @@ -1525,7 +1533,7 @@ long __sched hrtimer_nanosleep_restart(struct restart_block *restart)  	struct timespec __user  *rmtp;  	int ret = 0; -	hrtimer_init_on_stack(&t.timer, restart->nanosleep.index, +	hrtimer_init_on_stack(&t.timer, restart->nanosleep.clockid,  				HRTIMER_MODE_ABS);  	hrtimer_set_expires_tv64(&t.timer, restart->nanosleep.expires); @@ -1577,7 +1585,7 @@ long hrtimer_nanosleep(struct timespec *rqtp, struct timespec __user *rmtp,  	restart = ¤t_thread_info()->restart_block;  	restart->fn = hrtimer_nanosleep_restart; -	restart->nanosleep.index = t.timer.base->index; +	restart->nanosleep.clockid = t.timer.base->clockid;  	restart->nanosleep.rmtp = rmtp;  	restart->nanosleep.expires = hrtimer_get_expires_tv64(&t.timer); diff --git a/kernel/irq/proc.c b/kernel/irq/proc.c index 834899f2500..64e3df6ab1e 100644 --- a/kernel/irq/proc.c +++ b/kernel/irq/proc.c @@ -19,7 +19,7 @@ static struct proc_dir_entry *root_irq_dir;  #ifdef CONFIG_SMP -static int irq_affinity_proc_show(struct seq_file *m, void *v) +static int show_irq_affinity(int type, struct seq_file *m, void *v)  {  	struct irq_desc *desc = irq_to_desc((long)m->private);  	const struct cpumask *mask = desc->irq_data.affinity; @@ -28,7 +28,10 @@ static int irq_affinity_proc_show(struct seq_file *m, void *v)  	if (irqd_is_setaffinity_pending(&desc->irq_data))  		mask = desc->pending_mask;  #endif -	seq_cpumask(m, mask); +	if (type) +		seq_cpumask_list(m, mask); +	else +		seq_cpumask(m, mask);  	seq_putc(m, '\n');  	return 0;  } @@ -59,7 
+62,18 @@ static int irq_affinity_hint_proc_show(struct seq_file *m, void *v)  #endif  int no_irq_affinity; -static ssize_t irq_affinity_proc_write(struct file *file, +static int irq_affinity_proc_show(struct seq_file *m, void *v) +{ +	return show_irq_affinity(0, m, v); +} + +static int irq_affinity_list_proc_show(struct seq_file *m, void *v) +{ +	return show_irq_affinity(1, m, v); +} + + +static ssize_t write_irq_affinity(int type, struct file *file,  		const char __user *buffer, size_t count, loff_t *pos)  {  	unsigned int irq = (int)(long)PDE(file->f_path.dentry->d_inode)->data; @@ -72,7 +86,10 @@ static ssize_t irq_affinity_proc_write(struct file *file,  	if (!alloc_cpumask_var(&new_value, GFP_KERNEL))  		return -ENOMEM; -	err = cpumask_parse_user(buffer, count, new_value); +	if (type) +		err = cpumask_parselist_user(buffer, count, new_value); +	else +		err = cpumask_parse_user(buffer, count, new_value);  	if (err)  		goto free_cpumask; @@ -100,11 +117,28 @@ free_cpumask:  	return err;  } +static ssize_t irq_affinity_proc_write(struct file *file, +		const char __user *buffer, size_t count, loff_t *pos) +{ +	return write_irq_affinity(0, file, buffer, count, pos); +} + +static ssize_t irq_affinity_list_proc_write(struct file *file, +		const char __user *buffer, size_t count, loff_t *pos) +{ +	return write_irq_affinity(1, file, buffer, count, pos); +} +  static int irq_affinity_proc_open(struct inode *inode, struct file *file)  {  	return single_open(file, irq_affinity_proc_show, PDE(inode)->data);  } +static int irq_affinity_list_proc_open(struct inode *inode, struct file *file) +{ +	return single_open(file, irq_affinity_list_proc_show, PDE(inode)->data); +} +  static int irq_affinity_hint_proc_open(struct inode *inode, struct file *file)  {  	return single_open(file, irq_affinity_hint_proc_show, PDE(inode)->data); @@ -125,6 +159,14 @@ static const struct file_operations irq_affinity_hint_proc_fops = {  	.release	= single_release,  }; +static const struct file_operations irq_affinity_list_proc_fops = { +	.open		= irq_affinity_list_proc_open, +	.read		= seq_read, +	.llseek		= seq_lseek, +	.release	= single_release, +	.write		= irq_affinity_list_proc_write, +}; +  static int default_affinity_show(struct seq_file *m, void *v)  {  	seq_cpumask(m, irq_default_affinity); @@ -289,6 +331,10 @@ void register_irq_proc(unsigned int irq, struct irq_desc *desc)  	proc_create_data("affinity_hint", 0400, desc->dir,  			 &irq_affinity_hint_proc_fops, (void *)(long)irq); +	/* create /proc/irq/<irq>/smp_affinity_list */ +	proc_create_data("smp_affinity_list", 0600, desc->dir, +			 &irq_affinity_list_proc_fops, (void *)(long)irq); +  	proc_create_data("node", 0444, desc->dir,  			 &irq_node_proc_fops, (void *)(long)irq);  #endif diff --git a/kernel/kmod.c b/kernel/kmod.c index 5ae0ff38425..ad6a81c58b4 100644 --- a/kernel/kmod.c +++ b/kernel/kmod.c @@ -25,6 +25,7 @@  #include <linux/kmod.h>  #include <linux/slab.h>  #include <linux/completion.h> +#include <linux/cred.h>  #include <linux/file.h>  #include <linux/fdtable.h>  #include <linux/workqueue.h> @@ -43,6 +44,13 @@ extern int max_threads;  static struct workqueue_struct *khelper_wq; +#define CAP_BSET	(void *)1 +#define CAP_PI		(void *)2 + +static kernel_cap_t usermodehelper_bset = CAP_FULL_SET; +static kernel_cap_t usermodehelper_inheritable = CAP_FULL_SET; +static DEFINE_SPINLOCK(umh_sysctl_lock); +  #ifdef CONFIG_MODULES  /* @@ -132,6 +140,7 @@ EXPORT_SYMBOL(__request_module);  static int ____call_usermodehelper(void *data)  {  	struct 
subprocess_info *sub_info = data; +	struct cred *new;  	int retval;  	spin_lock_irq(¤t->sighand->siglock); @@ -153,6 +162,19 @@ static int ____call_usermodehelper(void *data)  			goto fail;  	} +	retval = -ENOMEM; +	new = prepare_kernel_cred(current); +	if (!new) +		goto fail; + +	spin_lock(&umh_sysctl_lock); +	new->cap_bset = cap_intersect(usermodehelper_bset, new->cap_bset); +	new->cap_inheritable = cap_intersect(usermodehelper_inheritable, +					     new->cap_inheritable); +	spin_unlock(&umh_sysctl_lock); + +	commit_creds(new); +  	retval = kernel_execve(sub_info->path,  			       (const char *const *)sub_info->argv,  			       (const char *const *)sub_info->envp); @@ -420,6 +442,84 @@ unlock:  }  EXPORT_SYMBOL(call_usermodehelper_exec); +static int proc_cap_handler(struct ctl_table *table, int write, +			 void __user *buffer, size_t *lenp, loff_t *ppos) +{ +	struct ctl_table t; +	unsigned long cap_array[_KERNEL_CAPABILITY_U32S]; +	kernel_cap_t new_cap; +	int err, i; + +	if (write && (!capable(CAP_SETPCAP) || +		      !capable(CAP_SYS_MODULE))) +		return -EPERM; + +	/* +	 * convert from the global kernel_cap_t to the ulong array to print to +	 * userspace if this is a read. +	 */ +	spin_lock(&umh_sysctl_lock); +	for (i = 0; i < _KERNEL_CAPABILITY_U32S; i++)  { +		if (table->data == CAP_BSET) +			cap_array[i] = usermodehelper_bset.cap[i]; +		else if (table->data == CAP_PI) +			cap_array[i] = usermodehelper_inheritable.cap[i]; +		else +			BUG(); +	} +	spin_unlock(&umh_sysctl_lock); + +	t = *table; +	t.data = &cap_array; + +	/* +	 * actually read or write and array of ulongs from userspace.  Remember +	 * these are least significant 32 bits first +	 */ +	err = proc_doulongvec_minmax(&t, write, buffer, lenp, ppos); +	if (err < 0) +		return err; + +	/* +	 * convert from the sysctl array of ulongs to the kernel_cap_t +	 * internal representation +	 */ +	for (i = 0; i < _KERNEL_CAPABILITY_U32S; i++) +		new_cap.cap[i] = cap_array[i]; + +	/* +	 * Drop everything not in the new_cap (but don't add things) +	 */ +	spin_lock(&umh_sysctl_lock); +	if (write) { +		if (table->data == CAP_BSET) +			usermodehelper_bset = cap_intersect(usermodehelper_bset, new_cap); +		if (table->data == CAP_PI) +			usermodehelper_inheritable = cap_intersect(usermodehelper_inheritable, new_cap); +	} +	spin_unlock(&umh_sysctl_lock); + +	return 0; +} + +struct ctl_table usermodehelper_table[] = { +	{ +		.procname	= "bset", +		.data		= CAP_BSET, +		.maxlen		= _KERNEL_CAPABILITY_U32S * sizeof(unsigned long), +		.mode		= 0600, +		.proc_handler	= proc_cap_handler, +	}, +	{ +		.procname	= "inheritable", +		.data		= CAP_PI, +		.maxlen		= _KERNEL_CAPABILITY_U32S * sizeof(unsigned long), +		.mode		= 0600, +		.proc_handler	= proc_cap_handler, +	}, +	{ } +}; +  void __init usermodehelper_init(void)  {  	khelper_wq = create_singlethread_workqueue("khelper"); diff --git a/kernel/module.c b/kernel/module.c index 22879725678..795bdc7f5c3 100644 --- a/kernel/module.c +++ b/kernel/module.c @@ -2812,7 +2812,7 @@ static struct module *load_module(void __user *umod,  	}  	/* This has to be done once we're sure module name is unique. 
*/ -	if (!mod->taints) +	if (!mod->taints || mod->taints == (1U<<TAINT_CRAP))  		dynamic_debug_setup(info.debug, info.num_debug);  	/* Find duplicate symbols */ @@ -2849,7 +2849,7 @@ static struct module *load_module(void __user *umod,  	module_bug_cleanup(mod);   ddebug: -	if (!mod->taints) +	if (!mod->taints || mod->taints == (1U<<TAINT_CRAP))  		dynamic_debug_remove(info.debug);   unlock:  	mutex_unlock(&module_mutex); diff --git a/kernel/mutex.c b/kernel/mutex.c index 2c938e2337c..d607ed5dd44 100644 --- a/kernel/mutex.c +++ b/kernel/mutex.c @@ -131,14 +131,14 @@ EXPORT_SYMBOL(mutex_unlock);   */  static inline int __sched  __mutex_lock_common(struct mutex *lock, long state, unsigned int subclass, -	       	unsigned long ip) +		    struct lockdep_map *nest_lock, unsigned long ip)  {  	struct task_struct *task = current;  	struct mutex_waiter waiter;  	unsigned long flags;  	preempt_disable(); -	mutex_acquire(&lock->dep_map, subclass, 0, ip); +	mutex_acquire_nest(&lock->dep_map, subclass, 0, nest_lock, ip);  #ifdef CONFIG_MUTEX_SPIN_ON_OWNER  	/* @@ -269,16 +269,25 @@ void __sched  mutex_lock_nested(struct mutex *lock, unsigned int subclass)  {  	might_sleep(); -	__mutex_lock_common(lock, TASK_UNINTERRUPTIBLE, subclass, _RET_IP_); +	__mutex_lock_common(lock, TASK_UNINTERRUPTIBLE, subclass, NULL, _RET_IP_);  }  EXPORT_SYMBOL_GPL(mutex_lock_nested); +void __sched +_mutex_lock_nest_lock(struct mutex *lock, struct lockdep_map *nest) +{ +	might_sleep(); +	__mutex_lock_common(lock, TASK_UNINTERRUPTIBLE, 0, nest, _RET_IP_); +} + +EXPORT_SYMBOL_GPL(_mutex_lock_nest_lock); +  int __sched  mutex_lock_killable_nested(struct mutex *lock, unsigned int subclass)  {  	might_sleep(); -	return __mutex_lock_common(lock, TASK_KILLABLE, subclass, _RET_IP_); +	return __mutex_lock_common(lock, TASK_KILLABLE, subclass, NULL, _RET_IP_);  }  EXPORT_SYMBOL_GPL(mutex_lock_killable_nested); @@ -287,7 +296,7 @@ mutex_lock_interruptible_nested(struct mutex *lock, unsigned int subclass)  {  	might_sleep();  	return __mutex_lock_common(lock, TASK_INTERRUPTIBLE, -				   subclass, _RET_IP_); +				   subclass, NULL, _RET_IP_);  }  EXPORT_SYMBOL_GPL(mutex_lock_interruptible_nested); @@ -393,7 +402,7 @@ __mutex_lock_slowpath(atomic_t *lock_count)  {  	struct mutex *lock = container_of(lock_count, struct mutex, count); -	__mutex_lock_common(lock, TASK_UNINTERRUPTIBLE, 0, _RET_IP_); +	__mutex_lock_common(lock, TASK_UNINTERRUPTIBLE, 0, NULL, _RET_IP_);  }  static noinline int __sched @@ -401,7 +410,7 @@ __mutex_lock_killable_slowpath(atomic_t *lock_count)  {  	struct mutex *lock = container_of(lock_count, struct mutex, count); -	return __mutex_lock_common(lock, TASK_KILLABLE, 0, _RET_IP_); +	return __mutex_lock_common(lock, TASK_KILLABLE, 0, NULL, _RET_IP_);  }  static noinline int __sched @@ -409,7 +418,7 @@ __mutex_lock_interruptible_slowpath(atomic_t *lock_count)  {  	struct mutex *lock = container_of(lock_count, struct mutex, count); -	return __mutex_lock_common(lock, TASK_INTERRUPTIBLE, 0, _RET_IP_); +	return __mutex_lock_common(lock, TASK_INTERRUPTIBLE, 0, NULL, _RET_IP_);  }  #endif diff --git a/kernel/nsproxy.c b/kernel/nsproxy.c index a05d191ffdd..5424e37673e 100644 --- a/kernel/nsproxy.c +++ b/kernel/nsproxy.c @@ -22,6 +22,9 @@  #include <linux/pid_namespace.h>  #include <net/net_namespace.h>  #include <linux/ipc_namespace.h> +#include <linux/proc_fs.h> +#include <linux/file.h> +#include <linux/syscalls.h>  static struct kmem_cache *nsproxy_cachep; @@ -233,6 +236,45 @@ void exit_task_namespaces(struct task_struct *p) 
 	switch_task_namespaces(p, NULL);  } +SYSCALL_DEFINE2(setns, int, fd, int, nstype) +{ +	const struct proc_ns_operations *ops; +	struct task_struct *tsk = current; +	struct nsproxy *new_nsproxy; +	struct proc_inode *ei; +	struct file *file; +	int err; + +	if (!capable(CAP_SYS_ADMIN)) +		return -EPERM; + +	file = proc_ns_fget(fd); +	if (IS_ERR(file)) +		return PTR_ERR(file); + +	err = -EINVAL; +	ei = PROC_I(file->f_dentry->d_inode); +	ops = ei->ns_ops; +	if (nstype && (ops->type != nstype)) +		goto out; + +	new_nsproxy = create_new_namespaces(0, tsk, tsk->fs); +	if (IS_ERR(new_nsproxy)) { +		err = PTR_ERR(new_nsproxy); +		goto out; +	} + +	err = ops->install(new_nsproxy, ei->ns); +	if (err) { +		free_nsproxy(new_nsproxy); +		goto out; +	} +	switch_task_namespaces(tsk, new_nsproxy); +out: +	fput(file); +	return err; +} +  static int __init nsproxy_cache_init(void)  {  	nsproxy_cachep = KMEM_CACHE(nsproxy, SLAB_PANIC); diff --git a/kernel/pm_qos_params.c b/kernel/pm_qos_params.c index 0da058bff8e..beb184689af 100644 --- a/kernel/pm_qos_params.c +++ b/kernel/pm_qos_params.c @@ -385,7 +385,7 @@ static ssize_t pm_qos_power_read(struct file *filp, char __user *buf,  	s32 value;  	unsigned long flags;  	struct pm_qos_object *o; -	struct pm_qos_request_list *pm_qos_req = filp->private_data;; +	struct pm_qos_request_list *pm_qos_req = filp->private_data;  	if (!pm_qos_req)  		return -EINVAL; diff --git a/kernel/posix-cpu-timers.c b/kernel/posix-cpu-timers.c index 0791b13df7b..58f405b581e 100644 --- a/kernel/posix-cpu-timers.c +++ b/kernel/posix-cpu-timers.c @@ -1514,7 +1514,7 @@ static int posix_cpu_nsleep(const clockid_t which_clock, int flags,  			return -EFAULT;  		restart_block->fn = posix_cpu_nsleep_restart; -		restart_block->nanosleep.index = which_clock; +		restart_block->nanosleep.clockid = which_clock;  		restart_block->nanosleep.rmtp = rmtp;  		restart_block->nanosleep.expires = timespec_to_ns(rqtp);  	} @@ -1523,7 +1523,7 @@ static int posix_cpu_nsleep(const clockid_t which_clock, int flags,  static long posix_cpu_nsleep_restart(struct restart_block *restart_block)  { -	clockid_t which_clock = restart_block->nanosleep.index; +	clockid_t which_clock = restart_block->nanosleep.clockid;  	struct timespec t;  	struct itimerspec it;  	int error; diff --git a/kernel/posix-timers.c b/kernel/posix-timers.c index e5498d7405c..4556182527f 100644 --- a/kernel/posix-timers.c +++ b/kernel/posix-timers.c @@ -491,6 +491,13 @@ static struct k_itimer * alloc_posix_timer(void)  	return tmr;  } +static void k_itimer_rcu_free(struct rcu_head *head) +{ +	struct k_itimer *tmr = container_of(head, struct k_itimer, it.rcu); + +	kmem_cache_free(posix_timers_cache, tmr); +} +  #define IT_ID_SET	1  #define IT_ID_NOT_SET	0  static void release_posix_timer(struct k_itimer *tmr, int it_id_set) @@ -503,7 +510,7 @@ static void release_posix_timer(struct k_itimer *tmr, int it_id_set)  	}  	put_pid(tmr->it_pid);  	sigqueue_free(tmr->sigq); -	kmem_cache_free(posix_timers_cache, tmr); +	call_rcu(&tmr->it.rcu, k_itimer_rcu_free);  }  static struct k_clock *clockid_to_kclock(const clockid_t id) @@ -631,22 +638,18 @@ out:  static struct k_itimer *__lock_timer(timer_t timer_id, unsigned long *flags)  {  	struct k_itimer *timr; -	/* -	 * Watch out here.  We do a irqsave on the idr_lock and pass the -	 * flags part over to the timer lock.  Must not let interrupts in -	 * while we are moving the lock. 
-	 */ -	spin_lock_irqsave(&idr_lock, *flags); + +	rcu_read_lock();  	timr = idr_find(&posix_timers_id, (int)timer_id);  	if (timr) { -		spin_lock(&timr->it_lock); +		spin_lock_irqsave(&timr->it_lock, *flags);  		if (timr->it_signal == current->signal) { -			spin_unlock(&idr_lock); +			rcu_read_unlock();  			return timr;  		} -		spin_unlock(&timr->it_lock); +		spin_unlock_irqrestore(&timr->it_lock, *flags);  	} -	spin_unlock_irqrestore(&idr_lock, *flags); +	rcu_read_unlock();  	return NULL;  } @@ -1056,7 +1059,7 @@ SYSCALL_DEFINE4(clock_nanosleep, const clockid_t, which_clock, int, flags,   */  long clock_nanosleep_restart(struct restart_block *restart_block)  { -	clockid_t which_clock = restart_block->nanosleep.index; +	clockid_t which_clock = restart_block->nanosleep.clockid;  	struct k_clock *kc = clockid_to_kclock(which_clock);  	if (WARN_ON_ONCE(!kc || !kc->nsleep_restart)) diff --git a/kernel/printk.c b/kernel/printk.c index da8ca817eae..35185392173 100644 --- a/kernel/printk.c +++ b/kernel/printk.c @@ -31,6 +31,7 @@  #include <linux/smp.h>  #include <linux/security.h>  #include <linux/bootmem.h> +#include <linux/memblock.h>  #include <linux/syscalls.h>  #include <linux/kexec.h>  #include <linux/kdb.h> @@ -167,46 +168,74 @@ void log_buf_kexec_setup(void)  }  #endif +/* requested log_buf_len from kernel cmdline */ +static unsigned long __initdata new_log_buf_len; + +/* save requested log_buf_len since it's too early to process it */  static int __init log_buf_len_setup(char *str)  {  	unsigned size = memparse(str, &str); -	unsigned long flags;  	if (size)  		size = roundup_pow_of_two(size); -	if (size > log_buf_len) { -		unsigned start, dest_idx, offset; -		char *new_log_buf; +	if (size > log_buf_len) +		new_log_buf_len = size; -		new_log_buf = alloc_bootmem(size); -		if (!new_log_buf) { -			printk(KERN_WARNING "log_buf_len: allocation failed\n"); -			goto out; -		} +	return 0; +} +early_param("log_buf_len", log_buf_len_setup); -		spin_lock_irqsave(&logbuf_lock, flags); -		log_buf_len = size; -		log_buf = new_log_buf; - -		offset = start = min(con_start, log_start); -		dest_idx = 0; -		while (start != log_end) { -			log_buf[dest_idx] = __log_buf[start & (__LOG_BUF_LEN - 1)]; -			start++; -			dest_idx++; -		} -		log_start -= offset; -		con_start -= offset; -		log_end -= offset; -		spin_unlock_irqrestore(&logbuf_lock, flags); +void __init setup_log_buf(int early) +{ +	unsigned long flags; +	unsigned start, dest_idx, offset; +	char *new_log_buf; +	int free; + +	if (!new_log_buf_len) +		return; + +	if (early) { +		unsigned long mem; -		printk(KERN_NOTICE "log_buf_len: %d\n", log_buf_len); +		mem = memblock_alloc(new_log_buf_len, PAGE_SIZE); +		if (mem == MEMBLOCK_ERROR) +			return; +		new_log_buf = __va(mem); +	} else { +		new_log_buf = alloc_bootmem_nopanic(new_log_buf_len);  	} -out: -	return 1; -} -__setup("log_buf_len=", log_buf_len_setup); +	if (unlikely(!new_log_buf)) { +		pr_err("log_buf_len: %ld bytes not available\n", +			new_log_buf_len); +		return; +	} + +	spin_lock_irqsave(&logbuf_lock, flags); +	log_buf_len = new_log_buf_len; +	log_buf = new_log_buf; +	new_log_buf_len = 0; +	free = __LOG_BUF_LEN - log_end; + +	offset = start = min(con_start, log_start); +	dest_idx = 0; +	while (start != log_end) { +		unsigned log_idx_mask = start & (__LOG_BUF_LEN - 1); + +		log_buf[dest_idx] = __log_buf[log_idx_mask]; +		start++; +		dest_idx++; +	} +	log_start -= offset; +	con_start -= offset; +	log_end -= offset; +	spin_unlock_irqrestore(&logbuf_lock, flags); + +	pr_info("log_buf_len: %d\n", 
log_buf_len); +	pr_info("early log buf free: %d(%d%%)\n", +		free, (free * 100) / __LOG_BUF_LEN); +}  #ifdef CONFIG_BOOT_PRINTK_DELAY diff --git a/kernel/profile.c b/kernel/profile.c index 66f841b7fbd..14c9f87b9fc 100644 --- a/kernel/profile.c +++ b/kernel/profile.c @@ -126,11 +126,9 @@ int __ref profile_init(void)  	if (prof_buffer)  		return 0; -	prof_buffer = vmalloc(buffer_bytes); -	if (prof_buffer) { -		memset(prof_buffer, 0, buffer_bytes); +	prof_buffer = vzalloc(buffer_bytes); +	if (prof_buffer)  		return 0; -	}  	free_cpumask_var(prof_cpu_mask);  	return -ENOMEM; diff --git a/kernel/ptrace.c b/kernel/ptrace.c index 7a81fc07134..2df115790cd 100644 --- a/kernel/ptrace.c +++ b/kernel/ptrace.c @@ -562,7 +562,7 @@ static int ptrace_resume(struct task_struct *child, long request,  	}  	child->exit_code = data; -	wake_up_process(child); +	wake_up_state(child, __TASK_TRACED);  	return 0;  } diff --git a/kernel/sched.c b/kernel/sched.c index c62acf45d3b..2d12893b8b0 100644 --- a/kernel/sched.c +++ b/kernel/sched.c @@ -293,7 +293,7 @@ static DEFINE_SPINLOCK(task_group_lock);   *  limitation from this.)   */  #define MIN_SHARES	2 -#define MAX_SHARES	(1UL << 18) +#define MAX_SHARES	(1UL << (18 + SCHED_LOAD_RESOLUTION))  static int root_task_group_load = ROOT_TASK_GROUP_LOAD;  #endif @@ -1330,13 +1330,25 @@ calc_delta_mine(unsigned long delta_exec, unsigned long weight,  {  	u64 tmp; -	tmp = (u64)delta_exec * weight; +	/* +	 * weight can be less than 2^SCHED_LOAD_RESOLUTION for task group sched +	 * entities since MIN_SHARES = 2. Treat weight as 1 if less than +	 * 2^SCHED_LOAD_RESOLUTION. +	 */ +	if (likely(weight > (1UL << SCHED_LOAD_RESOLUTION))) +		tmp = (u64)delta_exec * scale_load_down(weight); +	else +		tmp = (u64)delta_exec;  	if (!lw->inv_weight) { -		if (BITS_PER_LONG > 32 && unlikely(lw->weight >= WMULT_CONST)) +		unsigned long w = scale_load_down(lw->weight); + +		if (BITS_PER_LONG > 32 && unlikely(w >= WMULT_CONST))  			lw->inv_weight = 1; +		else if (unlikely(!w)) +			lw->inv_weight = WMULT_CONST;  		else -			lw->inv_weight = WMULT_CONST / lw->weight; +			lw->inv_weight = WMULT_CONST / w;  	}  	/* @@ -1778,17 +1790,20 @@ static void dec_nr_running(struct rq *rq)  static void set_load_weight(struct task_struct *p)  { +	int prio = p->static_prio - MAX_RT_PRIO; +	struct load_weight *load = &p->se.load; +  	/*  	 * SCHED_IDLE tasks get minimal weight:  	 */  	if (p->policy == SCHED_IDLE) { -		p->se.load.weight = WEIGHT_IDLEPRIO; -		p->se.load.inv_weight = WMULT_IDLEPRIO; +		load->weight = scale_load(WEIGHT_IDLEPRIO); +		load->inv_weight = WMULT_IDLEPRIO;  		return;  	} -	p->se.load.weight = prio_to_weight[p->static_prio - MAX_RT_PRIO]; -	p->se.load.inv_weight = prio_to_wmult[p->static_prio - MAX_RT_PRIO]; +	load->weight = scale_load(prio_to_weight[prio]); +	load->inv_weight = prio_to_wmult[prio];  }  static void enqueue_task(struct rq *rq, struct task_struct *p, int flags) @@ -2564,7 +2579,7 @@ static void ttwu_queue(struct task_struct *p, int cpu)  {  	struct rq *rq = cpu_rq(cpu); -#if defined(CONFIG_SMP) && defined(CONFIG_SCHED_TTWU_QUEUE) +#if defined(CONFIG_SMP)  	if (sched_feat(TTWU_QUEUE) && cpu != smp_processor_id()) {  		ttwu_queue_remote(p, cpu);  		return; @@ -6527,7 +6542,7 @@ static int sched_domain_debug_one(struct sched_domain *sd, int cpu, int level,  		cpulist_scnprintf(str, sizeof(str), sched_group_cpus(group));  		printk(KERN_CONT " %s", str); -		if (group->cpu_power != SCHED_LOAD_SCALE) { +		if (group->cpu_power != SCHED_POWER_SCALE) {  			printk(KERN_CONT " 
(cpu_power = %d)",  				group->cpu_power);  		} @@ -7902,7 +7917,7 @@ void __init sched_init(void)  #ifdef CONFIG_SMP  		rq->sd = NULL;  		rq->rd = NULL; -		rq->cpu_power = SCHED_LOAD_SCALE; +		rq->cpu_power = SCHED_POWER_SCALE;  		rq->post_schedule = 0;  		rq->active_balance = 0;  		rq->next_balance = jiffies; @@ -8806,14 +8821,14 @@ cpu_cgroup_exit(struct cgroup_subsys *ss, struct cgroup *cgrp,  static int cpu_shares_write_u64(struct cgroup *cgrp, struct cftype *cftype,  				u64 shareval)  { -	return sched_group_set_shares(cgroup_tg(cgrp), shareval); +	return sched_group_set_shares(cgroup_tg(cgrp), scale_load(shareval));  }  static u64 cpu_shares_read_u64(struct cgroup *cgrp, struct cftype *cft)  {  	struct task_group *tg = cgroup_tg(cgrp); -	return (u64) tg->shares; +	return (u64) scale_load_down(tg->shares);  }  #endif /* CONFIG_FAIR_GROUP_SCHED */ diff --git a/kernel/sched_fair.c b/kernel/sched_fair.c index 37f22626225..e32a9b70ee9 100644 --- a/kernel/sched_fair.c +++ b/kernel/sched_fair.c @@ -1584,7 +1584,7 @@ find_idlest_group(struct sched_domain *sd, struct task_struct *p,  		}  		/* Adjust by relative CPU power of the group */ -		avg_load = (avg_load * SCHED_LOAD_SCALE) / group->cpu_power; +		avg_load = (avg_load * SCHED_POWER_SCALE) / group->cpu_power;  		if (local_group) {  			this_load = avg_load; @@ -1722,7 +1722,7 @@ select_task_rq_fair(struct task_struct *p, int sd_flag, int wake_flags)  				nr_running += cpu_rq(i)->cfs.nr_running;  			} -			capacity = DIV_ROUND_CLOSEST(power, SCHED_LOAD_SCALE); +			capacity = DIV_ROUND_CLOSEST(power, SCHED_POWER_SCALE);  			if (tmp->flags & SD_POWERSAVINGS_BALANCE)  				nr_running /= 2; @@ -2570,7 +2570,7 @@ static inline int check_power_save_busiest_group(struct sd_lb_stats *sds,  unsigned long default_scale_freq_power(struct sched_domain *sd, int cpu)  { -	return SCHED_LOAD_SCALE; +	return SCHED_POWER_SCALE;  }  unsigned long __weak arch_scale_freq_power(struct sched_domain *sd, int cpu) @@ -2607,10 +2607,10 @@ unsigned long scale_rt_power(int cpu)  		available = total - rq->rt_avg;  	} -	if (unlikely((s64)total < SCHED_LOAD_SCALE)) -		total = SCHED_LOAD_SCALE; +	if (unlikely((s64)total < SCHED_POWER_SCALE)) +		total = SCHED_POWER_SCALE; -	total >>= SCHED_LOAD_SHIFT; +	total >>= SCHED_POWER_SHIFT;  	return div_u64(available, total);  } @@ -2618,7 +2618,7 @@ unsigned long scale_rt_power(int cpu)  static void update_cpu_power(struct sched_domain *sd, int cpu)  {  	unsigned long weight = sd->span_weight; -	unsigned long power = SCHED_LOAD_SCALE; +	unsigned long power = SCHED_POWER_SCALE;  	struct sched_group *sdg = sd->groups;  	if ((sd->flags & SD_SHARE_CPUPOWER) && weight > 1) { @@ -2627,7 +2627,7 @@ static void update_cpu_power(struct sched_domain *sd, int cpu)  		else  			power *= default_scale_smt_power(sd, cpu); -		power >>= SCHED_LOAD_SHIFT; +		power >>= SCHED_POWER_SHIFT;  	}  	sdg->cpu_power_orig = power; @@ -2637,10 +2637,10 @@ static void update_cpu_power(struct sched_domain *sd, int cpu)  	else  		power *= default_scale_freq_power(sd, cpu); -	power >>= SCHED_LOAD_SHIFT; +	power >>= SCHED_POWER_SHIFT;  	power *= scale_rt_power(cpu); -	power >>= SCHED_LOAD_SHIFT; +	power >>= SCHED_POWER_SHIFT;  	if (!power)  		power = 1; @@ -2682,7 +2682,7 @@ static inline int  fix_small_capacity(struct sched_domain *sd, struct sched_group *group)  {  	/* -	 * Only siblings can have significantly less than SCHED_LOAD_SCALE +	 * Only siblings can have significantly less than SCHED_POWER_SCALE  	 */  	if (!(sd->flags & SD_SHARE_CPUPOWER))  		return 
0; @@ -2770,7 +2770,7 @@ static inline void update_sg_lb_stats(struct sched_domain *sd,  	}  	/* Adjust by relative CPU power of the group */ -	sgs->avg_load = (sgs->group_load * SCHED_LOAD_SCALE) / group->cpu_power; +	sgs->avg_load = (sgs->group_load*SCHED_POWER_SCALE) / group->cpu_power;  	/*  	 * Consider the group unbalanced when the imbalance is larger @@ -2787,7 +2787,8 @@ static inline void update_sg_lb_stats(struct sched_domain *sd,  	if ((max_cpu_load - min_cpu_load) >= avg_load_per_task && max_nr_running > 1)  		sgs->group_imb = 1; -	sgs->group_capacity = DIV_ROUND_CLOSEST(group->cpu_power, SCHED_LOAD_SCALE); +	sgs->group_capacity = DIV_ROUND_CLOSEST(group->cpu_power, +						SCHED_POWER_SCALE);  	if (!sgs->group_capacity)  		sgs->group_capacity = fix_small_capacity(sd, group);  	sgs->group_weight = group->group_weight; @@ -2961,7 +2962,7 @@ static int check_asym_packing(struct sched_domain *sd,  		return 0;  	*imbalance = DIV_ROUND_CLOSEST(sds->max_load * sds->busiest->cpu_power, -				       SCHED_LOAD_SCALE); +				       SCHED_POWER_SCALE);  	return 1;  } @@ -2990,7 +2991,7 @@ static inline void fix_small_imbalance(struct sd_lb_stats *sds,  			cpu_avg_load_per_task(this_cpu);  	scaled_busy_load_per_task = sds->busiest_load_per_task -						 * SCHED_LOAD_SCALE; +					 * SCHED_POWER_SCALE;  	scaled_busy_load_per_task /= sds->busiest->cpu_power;  	if (sds->max_load - sds->this_load + scaled_busy_load_per_task >= @@ -3009,10 +3010,10 @@ static inline void fix_small_imbalance(struct sd_lb_stats *sds,  			min(sds->busiest_load_per_task, sds->max_load);  	pwr_now += sds->this->cpu_power *  			min(sds->this_load_per_task, sds->this_load); -	pwr_now /= SCHED_LOAD_SCALE; +	pwr_now /= SCHED_POWER_SCALE;  	/* Amount of load we'd subtract */ -	tmp = (sds->busiest_load_per_task * SCHED_LOAD_SCALE) / +	tmp = (sds->busiest_load_per_task * SCHED_POWER_SCALE) /  		sds->busiest->cpu_power;  	if (sds->max_load > tmp)  		pwr_move += sds->busiest->cpu_power * @@ -3020,15 +3021,15 @@ static inline void fix_small_imbalance(struct sd_lb_stats *sds,  	/* Amount of load we'd add */  	if (sds->max_load * sds->busiest->cpu_power < -		sds->busiest_load_per_task * SCHED_LOAD_SCALE) +		sds->busiest_load_per_task * SCHED_POWER_SCALE)  		tmp = (sds->max_load * sds->busiest->cpu_power) /  			sds->this->cpu_power;  	else -		tmp = (sds->busiest_load_per_task * SCHED_LOAD_SCALE) / +		tmp = (sds->busiest_load_per_task * SCHED_POWER_SCALE) /  			sds->this->cpu_power;  	pwr_move += sds->this->cpu_power *  			min(sds->this_load_per_task, sds->this_load + tmp); -	pwr_move /= SCHED_LOAD_SCALE; +	pwr_move /= SCHED_POWER_SCALE;  	/* Move if we gain throughput */  	if (pwr_move > pwr_now) @@ -3070,7 +3071,7 @@ static inline void calculate_imbalance(struct sd_lb_stats *sds, int this_cpu,  		load_above_capacity = (sds->busiest_nr_running -  						sds->busiest_group_capacity); -		load_above_capacity *= (SCHED_LOAD_SCALE * SCHED_LOAD_SCALE); +		load_above_capacity *= (SCHED_LOAD_SCALE * SCHED_POWER_SCALE);  		load_above_capacity /= sds->busiest->cpu_power;  	} @@ -3090,7 +3091,7 @@ static inline void calculate_imbalance(struct sd_lb_stats *sds, int this_cpu,  	/* How much load to actually move to equalise the imbalance */  	*imbalance = min(max_pull * sds->busiest->cpu_power,  		(sds->avg_load - sds->this_load) * sds->this->cpu_power) -			/ SCHED_LOAD_SCALE; +			/ SCHED_POWER_SCALE;  	/*  	 * if *imbalance is less than the average load per runnable task @@ -3159,7 +3160,7 @@ find_busiest_group(struct sched_domain *sd, int 
this_cpu,  	if (!sds.busiest || sds.busiest_nr_running == 0)  		goto out_balanced; -	sds.avg_load = (SCHED_LOAD_SCALE * sds.total_load) / sds.total_pwr; +	sds.avg_load = (SCHED_POWER_SCALE * sds.total_load) / sds.total_pwr;  	/*  	 * If the busiest group is imbalanced the below checks don't @@ -3238,7 +3239,8 @@ find_busiest_queue(struct sched_domain *sd, struct sched_group *group,  	for_each_cpu(i, sched_group_cpus(group)) {  		unsigned long power = power_of(i); -		unsigned long capacity = DIV_ROUND_CLOSEST(power, SCHED_LOAD_SCALE); +		unsigned long capacity = DIV_ROUND_CLOSEST(power, +							   SCHED_POWER_SCALE);  		unsigned long wl;  		if (!capacity) @@ -3263,7 +3265,7 @@ find_busiest_queue(struct sched_domain *sd, struct sched_group *group,  		 * the load can be moved away from the cpu that is potentially  		 * running at a lower capacity.  		 */ -		wl = (wl * SCHED_LOAD_SCALE) / power; +		wl = (wl * SCHED_POWER_SCALE) / power;  		if (wl > max_load) {  			max_load = wl; diff --git a/kernel/signal.c b/kernel/signal.c index ad5e818baac..86c32b884f8 100644 --- a/kernel/signal.c +++ b/kernel/signal.c @@ -3023,8 +3023,10 @@ SYSCALL_DEFINE2(signal, int, sig, __sighandler_t, handler)  SYSCALL_DEFINE0(pause)  { -	current->state = TASK_INTERRUPTIBLE; -	schedule(); +	while (!signal_pending(current)) { +		current->state = TASK_INTERRUPTIBLE; +		schedule(); +	}  	return -ERESTARTNOHAND;  } diff --git a/kernel/sysctl.c b/kernel/sysctl.c index c0bb32414b1..4fc92445a29 100644 --- a/kernel/sysctl.c +++ b/kernel/sysctl.c @@ -56,6 +56,7 @@  #include <linux/kprobes.h>  #include <linux/pipe_fs_i.h>  #include <linux/oom.h> +#include <linux/kmod.h>  #include <asm/uaccess.h>  #include <asm/processor.h> @@ -616,6 +617,11 @@ static struct ctl_table kern_table[] = {  		.child		= random_table,  	},  	{ +		.procname	= "usermodehelper", +		.mode		= 0555, +		.child		= usermodehelper_table, +	}, +	{  		.procname	= "overflowuid",  		.data		= &overflowuid,  		.maxlen		= sizeof(int), @@ -730,14 +736,16 @@ static struct ctl_table kern_table[] = {  		.data           = &watchdog_enabled,  		.maxlen         = sizeof (int),  		.mode           = 0644, -		.proc_handler   = proc_dowatchdog_enabled, +		.proc_handler   = proc_dowatchdog, +		.extra1		= &zero, +		.extra2		= &one,  	},  	{  		.procname	= "watchdog_thresh", -		.data		= &softlockup_thresh, +		.data		= &watchdog_thresh,  		.maxlen		= sizeof(int),  		.mode		= 0644, -		.proc_handler	= proc_dowatchdog_thresh, +		.proc_handler	= proc_dowatchdog,  		.extra1		= &neg_one,  		.extra2		= &sixty,  	}, @@ -755,7 +763,9 @@ static struct ctl_table kern_table[] = {  		.data           = &watchdog_enabled,  		.maxlen         = sizeof (int),  		.mode           = 0644, -		.proc_handler   = proc_dowatchdog_enabled, +		.proc_handler   = proc_dowatchdog, +		.extra1		= &zero, +		.extra2		= &one,  	},  #endif  #if defined(CONFIG_X86_LOCAL_APIC) && defined(CONFIG_X86) @@ -1496,7 +1506,7 @@ static struct ctl_table fs_table[] = {  static struct ctl_table debug_table[] = {  #if defined(CONFIG_X86) || defined(CONFIG_PPC) || defined(CONFIG_SPARC) || \ -    defined(CONFIG_S390) +    defined(CONFIG_S390) || defined(CONFIG_TILE)  	{  		.procname	= "exception-trace",  		.data		= &show_unhandled_signals, diff --git a/kernel/time/alarmtimer.c b/kernel/time/alarmtimer.c index 9265014cb4d..2d966244ea6 100644 --- a/kernel/time/alarmtimer.c +++ b/kernel/time/alarmtimer.c @@ -494,7 +494,7 @@ static int update_rmtp(ktime_t exp, enum  alarmtimer_type type,   */  static long __sched 
alarm_timer_nsleep_restart(struct restart_block *restart)  { -	enum  alarmtimer_type type = restart->nanosleep.index; +	enum  alarmtimer_type type = restart->nanosleep.clockid;  	ktime_t exp;  	struct timespec __user  *rmtp;  	struct alarm alarm; @@ -573,7 +573,7 @@ static int alarm_timer_nsleep(const clockid_t which_clock, int flags,  	restart = ¤t_thread_info()->restart_block;  	restart->fn = alarm_timer_nsleep_restart; -	restart->nanosleep.index = type; +	restart->nanosleep.clockid = type;  	restart->nanosleep.expires = exp.tv64;  	restart->nanosleep.rmtp = rmtp;  	ret = -ERESTART_RESTARTBLOCK; @@ -669,12 +669,20 @@ static int __init has_wakealarm(struct device *dev, void *name_ptr)   */  static int __init alarmtimer_init_late(void)  { +	struct device *dev;  	char *str;  	/* Find an rtc device and init the rtc_timer */ -	class_find_device(rtc_class, NULL, &str, has_wakealarm); -	if (str) +	dev = class_find_device(rtc_class, NULL, &str, has_wakealarm); +	/* If we have a device then str is valid. See has_wakealarm() */ +	if (dev) {  		rtcdev = rtc_class_open(str); +		/* +		 * Drop the reference we got in class_find_device, +		 * rtc_open takes its own. +		 */ +		put_device(dev); +	}  	if (!rtcdev) {  		printk(KERN_WARNING "No RTC device found, ALARM timers will"  			" not wake from suspend"); diff --git a/kernel/time/tick-broadcast.c b/kernel/time/tick-broadcast.c index 723c7637e55..c7218d13273 100644 --- a/kernel/time/tick-broadcast.c +++ b/kernel/time/tick-broadcast.c @@ -456,23 +456,27 @@ void tick_broadcast_oneshot_control(unsigned long reason)  	unsigned long flags;  	int cpu; -	raw_spin_lock_irqsave(&tick_broadcast_lock, flags); -  	/*  	 * Periodic mode does not care about the enter/exit of power  	 * states  	 */  	if (tick_broadcast_device.mode == TICKDEV_MODE_PERIODIC) -		goto out; +		return; -	bc = tick_broadcast_device.evtdev; +	/* +	 * We are called with preemtion disabled from the depth of the +	 * idle code, so we can't be moved away. 
+	 */  	cpu = smp_processor_id();  	td = &per_cpu(tick_cpu_device, cpu);  	dev = td->evtdev;  	if (!(dev->features & CLOCK_EVT_FEAT_C3STOP)) -		goto out; +		return; + +	bc = tick_broadcast_device.evtdev; +	raw_spin_lock_irqsave(&tick_broadcast_lock, flags);  	if (reason == CLOCK_EVT_NOTIFY_BROADCAST_ENTER) {  		if (!cpumask_test_cpu(cpu, tick_get_broadcast_oneshot_mask())) {  			cpumask_set_cpu(cpu, tick_get_broadcast_oneshot_mask()); @@ -489,8 +493,6 @@ void tick_broadcast_oneshot_control(unsigned long reason)  				tick_program_event(dev->next_event, 1);  		}  	} - -out:  	raw_spin_unlock_irqrestore(&tick_broadcast_lock, flags);  } diff --git a/kernel/time/timekeeping.c b/kernel/time/timekeeping.c index 8e6a05a5915..342408cf68d 100644 --- a/kernel/time/timekeeping.c +++ b/kernel/time/timekeeping.c @@ -680,7 +680,7 @@ static void timekeeping_resume(void)  	clockevents_notify(CLOCK_EVT_NOTIFY_RESUME, NULL);  	/* Resume hrtimers */ -	hres_timers_resume(); +	hrtimers_resume();  }  static int timekeeping_suspend(void) @@ -1099,6 +1099,21 @@ void get_xtime_and_monotonic_and_sleep_offset(struct timespec *xtim,  }  /** + * ktime_get_monotonic_offset() - get wall_to_monotonic in ktime_t format + */ +ktime_t ktime_get_monotonic_offset(void) +{ +	unsigned long seq; +	struct timespec wtom; + +	do { +		seq = read_seqbegin(&xtime_lock); +		wtom = wall_to_monotonic; +	} while (read_seqretry(&xtime_lock, seq)); +	return timespec_to_ktime(wtom); +} + +/**   * xtime_update() - advances the timekeeping infrastructure   * @ticks:	number of ticks, that have elapsed since the last call.   * diff --git a/kernel/utsname.c b/kernel/utsname.c index 44646179eab..bff131b9510 100644 --- a/kernel/utsname.c +++ b/kernel/utsname.c @@ -15,6 +15,7 @@  #include <linux/err.h>  #include <linux/slab.h>  #include <linux/user_namespace.h> +#include <linux/proc_fs.h>  static struct uts_namespace *create_uts_ns(void)  { @@ -79,3 +80,41 @@ void free_uts_ns(struct kref *kref)  	put_user_ns(ns->user_ns);  	kfree(ns);  } + +static void *utsns_get(struct task_struct *task) +{ +	struct uts_namespace *ns = NULL; +	struct nsproxy *nsproxy; + +	rcu_read_lock(); +	nsproxy = task_nsproxy(task); +	if (nsproxy) { +		ns = nsproxy->uts_ns; +		get_uts_ns(ns); +	} +	rcu_read_unlock(); + +	return ns; +} + +static void utsns_put(void *ns) +{ +	put_uts_ns(ns); +} + +static int utsns_install(struct nsproxy *nsproxy, void *ns) +{ +	get_uts_ns(ns); +	put_uts_ns(nsproxy->uts_ns); +	nsproxy->uts_ns = ns; +	return 0; +} + +const struct proc_ns_operations utsns_operations = { +	.name		= "uts", +	.type		= CLONE_NEWUTS, +	.get		= utsns_get, +	.put		= utsns_put, +	.install	= utsns_install, +}; + diff --git a/kernel/watchdog.c b/kernel/watchdog.c index 14733d4d156..7daa4b072e9 100644 --- a/kernel/watchdog.c +++ b/kernel/watchdog.c @@ -28,7 +28,7 @@  #include <linux/perf_event.h>  int watchdog_enabled = 1; -int __read_mostly softlockup_thresh = 60; +int __read_mostly watchdog_thresh = 10;  static DEFINE_PER_CPU(unsigned long, watchdog_touch_ts);  static DEFINE_PER_CPU(struct task_struct *, softlockup_watchdog); @@ -91,6 +91,17 @@ static int __init nosoftlockup_setup(char *str)  __setup("nosoftlockup", nosoftlockup_setup);  /*  */ +/* + * Hard-lockup warnings should be triggered after just a few seconds. Soft- + * lockups can have false positives under extreme conditions. So we generally + * want a higher threshold for soft lockups than for hard lockups. 
So we couple + * the thresholds with a factor: we make the soft threshold twice the amount of + * time the hard threshold is. + */ +static int get_softlockup_thresh(void) +{ +	return watchdog_thresh * 2; +}  /*   * Returns seconds, approximately.  We don't need nanosecond @@ -105,12 +116,12 @@ static unsigned long get_timestamp(int this_cpu)  static unsigned long get_sample_period(void)  {  	/* -	 * convert softlockup_thresh from seconds to ns +	 * convert watchdog_thresh from seconds to ns  	 * the divide by 5 is to give hrtimer 5 chances to  	 * increment before the hardlockup detector generates  	 * a warning  	 */ -	return softlockup_thresh / 5 * NSEC_PER_SEC; +	return get_softlockup_thresh() * (NSEC_PER_SEC / 5);  }  /* Commands for resetting the watchdog */ @@ -182,7 +193,7 @@ static int is_softlockup(unsigned long touch_ts)  	unsigned long now = get_timestamp(smp_processor_id());  	/* Warn about unreasonable delays: */ -	if (time_after(now, touch_ts + softlockup_thresh)) +	if (time_after(now, touch_ts + get_softlockup_thresh()))  		return now - touch_ts;  	return 0; @@ -359,7 +370,7 @@ static int watchdog_nmi_enable(int cpu)  	/* Try to register using hardware perf events */  	wd_attr = &wd_hw_attr; -	wd_attr->sample_period = hw_nmi_get_sample_period(); +	wd_attr->sample_period = hw_nmi_get_sample_period(watchdog_thresh);  	event = perf_event_create_kernel_counter(wd_attr, cpu, NULL, watchdog_overflow_callback);  	if (!IS_ERR(event)) {  		printk(KERN_INFO "NMI watchdog enabled, takes one hw-pmu counter.\n"); @@ -501,28 +512,25 @@ static void watchdog_disable_all_cpus(void)  /* sysctl functions */  #ifdef CONFIG_SYSCTL  /* - * proc handler for /proc/sys/kernel/nmi_watchdog + * proc handler for /proc/sys/kernel/nmi_watchdog,watchdog_thresh   */ -int proc_dowatchdog_enabled(struct ctl_table *table, int write, -		     void __user *buffer, size_t *length, loff_t *ppos) +int proc_dowatchdog(struct ctl_table *table, int write, +		    void __user *buffer, size_t *lenp, loff_t *ppos)  { -	proc_dointvec(table, write, buffer, length, ppos); +	int ret; -	if (write) { -		if (watchdog_enabled) -			watchdog_enable_all_cpus(); -		else -			watchdog_disable_all_cpus(); -	} -	return 0; -} +	ret = proc_dointvec_minmax(table, write, buffer, lenp, ppos); +	if (ret || !write) +		goto out; -int proc_dowatchdog_thresh(struct ctl_table *table, int write, -			     void __user *buffer, -			     size_t *lenp, loff_t *ppos) -{ -	return proc_dointvec_minmax(table, write, buffer, lenp, ppos); +	if (watchdog_enabled && watchdog_thresh) +		watchdog_enable_all_cpus(); +	else +		watchdog_disable_all_cpus(); + +out: +	return ret;  }  #endif /* CONFIG_SYSCTL */ diff --git a/kernel/workqueue.c b/kernel/workqueue.c index e3378e8d3a5..0400553f0d0 100644 --- a/kernel/workqueue.c +++ b/kernel/workqueue.c @@ -2866,9 +2866,7 @@ static int alloc_cwqs(struct workqueue_struct *wq)  		}  	} -	/* just in case, make sure it's actually aligned -	 * - this is affected by PERCPU() alignment in vmlinux.lds.S -	 */ +	/* just in case, make sure it's actually aligned */  	BUG_ON(!IS_ALIGNED(wq->cpu_wq.v, align));  	return wq->cpu_wq.v ? 0 : -ENOMEM;  } | 
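
The kernel/nsproxy.c hunk above adds a setns() system call: it takes a file descriptor opened on one of a task's /proc/<pid>/ns/* files plus an optional nstype check against ops->type, and it requires CAP_SYS_ADMIN. As a rough illustration of how it might be driven from userspace (not part of this change), the sketch below joins another process's UTS namespace; it assumes the build headers already provide __NR_setns for the target architecture, and the final hostname(1) exec is just an arbitrary way to observe the switch.

#define _GNU_SOURCE
#include <fcntl.h>
#include <sched.h>		/* CLONE_NEWUTS */
#include <stdio.h>
#include <sys/syscall.h>
#include <unistd.h>

int main(int argc, char *argv[])
{
	char path[64];
	int fd;

	if (argc != 2) {
		fprintf(stderr, "usage: %s <pid>\n", argv[0]);
		return 1;
	}

	/* proc_ns_fget() in the patch expects an fd on /proc/<pid>/ns/<name> */
	snprintf(path, sizeof(path), "/proc/%s/ns/uts", argv[1]);
	fd = open(path, O_RDONLY);
	if (fd < 0) {
		perror("open");
		return 1;
	}

	/* nstype is a sanity check: it must match ops->type (CLONE_NEWUTS here) */
	if (syscall(__NR_setns, fd, CLONE_NEWUTS) < 0) {
		perror("setns");	/* EPERM without CAP_SYS_ADMIN */
		return 1;
	}
	close(fd);

	/* now running in the target task's UTS namespace */
	execlp("hostname", "hostname", (char *)NULL);
	perror("execlp");
	return 1;
}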
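
Similarly, the kernel/irq/proc.c hunk adds a smp_affinity_list file alongside the existing smp_affinity one. It is parsed with cpumask_parselist_user() and printed with seq_cpumask_list(), so it takes a human-readable CPU list such as "0-3,8" rather than a hex mask. A minimal userspace sketch follows; the IRQ number 30 is made up for the example, and the file only exists on CONFIG_SMP kernels.

#include <fcntl.h>
#include <stdio.h>
#include <string.h>
#include <unistd.h>

int main(void)
{
	const char list[] = "0-3\n";	/* pin the IRQ to CPUs 0-3 */
	int fd = open("/proc/irq/30/smp_affinity_list", O_WRONLY);

	if (fd < 0) {
		perror("open");		/* missing on !CONFIG_SMP or unknown IRQ */
		return 1;
	}
	if (write(fd, list, strlen(list)) < 0)
		perror("write");	/* malformed lists are rejected */
	close(fd);
	return 0;
}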
