diff options
Diffstat (limited to 'kernel')
54 files changed, 2859 insertions, 2274 deletions
diff --git a/kernel/Makefile b/kernel/Makefile index a987aa1676b..149e18ef1ab 100644 --- a/kernel/Makefile +++ b/kernel/Makefile @@ -68,7 +68,7 @@ obj-$(CONFIG_USER_NS) += user_namespace.o obj-$(CONFIG_PID_NS) += pid_namespace.o obj-$(CONFIG_IKCONFIG) += configs.o obj-$(CONFIG_RESOURCE_COUNTERS) += res_counter.o -obj-$(CONFIG_STOP_MACHINE) += stop_machine.o +obj-$(CONFIG_SMP) += stop_machine.o obj-$(CONFIG_KPROBES_SANITY_TEST) += test_kprobes.o obj-$(CONFIG_AUDIT) += audit.o auditfilter.o audit_watch.o obj-$(CONFIG_AUDITSYSCALL) += auditsc.o diff --git a/kernel/capability.c b/kernel/capability.c index 9e4697e9b27..2f05303715a 100644 --- a/kernel/capability.c +++ b/kernel/capability.c @@ -15,7 +15,6 @@ #include <linux/syscalls.h> #include <linux/pid_namespace.h> #include <asm/uaccess.h> -#include "cred-internals.h" /* * Leveraged for setting/resetting capabilities diff --git a/kernel/cgroup.c b/kernel/cgroup.c index 6d870f2d122..e9ec642932e 100644 --- a/kernel/cgroup.c +++ b/kernel/cgroup.c @@ -3016,7 +3016,7 @@ static int cgroup_event_wake(wait_queue_t *wait, unsigned mode, unsigned long flags = (unsigned long)key; if (flags & POLLHUP) { - remove_wait_queue_locked(event->wqh, &event->wait); + __remove_wait_queue(event->wqh, &event->wait); spin_lock(&cgrp->event_list_lock); list_del(&event->list); spin_unlock(&cgrp->event_list_lock); diff --git a/kernel/cpu.c b/kernel/cpu.c index 25bba73b1be..54577757477 100644 --- a/kernel/cpu.c +++ b/kernel/cpu.c @@ -164,6 +164,7 @@ static inline void check_for_tasks(int cpu) } struct take_cpu_down_param { + struct task_struct *caller; unsigned long mod; void *hcpu; }; @@ -172,6 +173,7 @@ struct take_cpu_down_param { static int __ref take_cpu_down(void *_param) { struct take_cpu_down_param *param = _param; + unsigned int cpu = (unsigned long)param->hcpu; int err; /* Ensure this CPU doesn't handle any more interrupts. */ @@ -182,6 +184,8 @@ static int __ref take_cpu_down(void *_param) raw_notifier_call_chain(&cpu_chain, CPU_DYING | param->mod, param->hcpu); + if (task_cpu(param->caller) == cpu) + move_task_off_dead_cpu(cpu, param->caller); /* Force idle task to run as soon as we yield: it should immediately notice cpu is offline and die quickly. */ sched_idle_next(); @@ -192,10 +196,10 @@ static int __ref take_cpu_down(void *_param) static int __ref _cpu_down(unsigned int cpu, int tasks_frozen) { int err, nr_calls = 0; - cpumask_var_t old_allowed; void *hcpu = (void *)(long)cpu; unsigned long mod = tasks_frozen ? CPU_TASKS_FROZEN : 0; struct take_cpu_down_param tcd_param = { + .caller = current, .mod = mod, .hcpu = hcpu, }; @@ -206,9 +210,6 @@ static int __ref _cpu_down(unsigned int cpu, int tasks_frozen) if (!cpu_online(cpu)) return -EINVAL; - if (!alloc_cpumask_var(&old_allowed, GFP_KERNEL)) - return -ENOMEM; - cpu_hotplug_begin(); set_cpu_active(cpu, false); err = __raw_notifier_call_chain(&cpu_chain, CPU_DOWN_PREPARE | mod, @@ -225,10 +226,6 @@ static int __ref _cpu_down(unsigned int cpu, int tasks_frozen) goto out_release; } - /* Ensure that we are not runnable on dying cpu */ - cpumask_copy(old_allowed, ¤t->cpus_allowed); - set_cpus_allowed_ptr(current, cpu_active_mask); - err = __stop_machine(take_cpu_down, &tcd_param, cpumask_of(cpu)); if (err) { set_cpu_active(cpu, true); @@ -237,7 +234,7 @@ static int __ref _cpu_down(unsigned int cpu, int tasks_frozen) hcpu) == NOTIFY_BAD) BUG(); - goto out_allowed; + goto out_release; } BUG_ON(cpu_online(cpu)); @@ -255,8 +252,6 @@ static int __ref _cpu_down(unsigned int cpu, int tasks_frozen) check_for_tasks(cpu); -out_allowed: - set_cpus_allowed_ptr(current, old_allowed); out_release: cpu_hotplug_done(); if (!err) { @@ -264,7 +259,6 @@ out_release: hcpu) == NOTIFY_BAD) BUG(); } - free_cpumask_var(old_allowed); return err; } @@ -272,9 +266,6 @@ int __ref cpu_down(unsigned int cpu) { int err; - err = stop_machine_create(); - if (err) - return err; cpu_maps_update_begin(); if (cpu_hotplug_disabled) { @@ -286,7 +277,6 @@ int __ref cpu_down(unsigned int cpu) out: cpu_maps_update_done(); - stop_machine_destroy(); return err; } EXPORT_SYMBOL(cpu_down); @@ -367,9 +357,6 @@ int disable_nonboot_cpus(void) { int cpu, first_cpu, error; - error = stop_machine_create(); - if (error) - return error; cpu_maps_update_begin(); first_cpu = cpumask_first(cpu_online_mask); /* @@ -400,7 +387,6 @@ int disable_nonboot_cpus(void) printk(KERN_ERR "Non-boot CPUs are not disabled\n"); } cpu_maps_update_done(); - stop_machine_destroy(); return error; } diff --git a/kernel/cpuset.c b/kernel/cpuset.c index d10946748ec..9a50c5f6e72 100644 --- a/kernel/cpuset.c +++ b/kernel/cpuset.c @@ -2182,19 +2182,52 @@ void __init cpuset_init_smp(void) void cpuset_cpus_allowed(struct task_struct *tsk, struct cpumask *pmask) { mutex_lock(&callback_mutex); - cpuset_cpus_allowed_locked(tsk, pmask); + task_lock(tsk); + guarantee_online_cpus(task_cs(tsk), pmask); + task_unlock(tsk); mutex_unlock(&callback_mutex); } -/** - * cpuset_cpus_allowed_locked - return cpus_allowed mask from a tasks cpuset. - * Must be called with callback_mutex held. - **/ -void cpuset_cpus_allowed_locked(struct task_struct *tsk, struct cpumask *pmask) +int cpuset_cpus_allowed_fallback(struct task_struct *tsk) { - task_lock(tsk); - guarantee_online_cpus(task_cs(tsk), pmask); - task_unlock(tsk); + const struct cpuset *cs; + int cpu; + + rcu_read_lock(); + cs = task_cs(tsk); + if (cs) + cpumask_copy(&tsk->cpus_allowed, cs->cpus_allowed); + rcu_read_unlock(); + + /* + * We own tsk->cpus_allowed, nobody can change it under us. + * + * But we used cs && cs->cpus_allowed lockless and thus can + * race with cgroup_attach_task() or update_cpumask() and get + * the wrong tsk->cpus_allowed. However, both cases imply the + * subsequent cpuset_change_cpumask()->set_cpus_allowed_ptr() + * which takes task_rq_lock(). + * + * If we are called after it dropped the lock we must see all + * changes in tsk_cs()->cpus_allowed. Otherwise we can temporary + * set any mask even if it is not right from task_cs() pov, + * the pending set_cpus_allowed_ptr() will fix things. + */ + + cpu = cpumask_any_and(&tsk->cpus_allowed, cpu_active_mask); + if (cpu >= nr_cpu_ids) { + /* + * Either tsk->cpus_allowed is wrong (see above) or it + * is actually empty. The latter case is only possible + * if we are racing with remove_tasks_in_empty_cpuset(). + * Like above we can temporary set any mask and rely on + * set_cpus_allowed_ptr() as synchronization point. + */ + cpumask_copy(&tsk->cpus_allowed, cpu_possible_mask); + cpu = cpumask_any(cpu_active_mask); + } + + return cpu; } void cpuset_init_current_mems_allowed(void) @@ -2383,22 +2416,6 @@ int __cpuset_node_allowed_hardwall(int node, gfp_t gfp_mask) } /** - * cpuset_lock - lock out any changes to cpuset structures - * - * The out of memory (oom) code needs to mutex_lock cpusets - * from being changed while it scans the tasklist looking for a - * task in an overlapping cpuset. Expose callback_mutex via this - * cpuset_lock() routine, so the oom code can lock it, before - * locking the task list. The tasklist_lock is a spinlock, so - * must be taken inside callback_mutex. - */ - -void cpuset_lock(void) -{ - mutex_lock(&callback_mutex); -} - -/** * cpuset_unlock - release lock on cpuset changes * * Undo the lock taken in a previous cpuset_lock() call. diff --git a/kernel/cred-internals.h b/kernel/cred-internals.h deleted file mode 100644 index 2dc4fc2d0bf..00000000000 --- a/kernel/cred-internals.h +++ /dev/null @@ -1,21 +0,0 @@ -/* Internal credentials stuff - * - * Copyright (C) 2008 Red Hat, Inc. All Rights Reserved. - * Written by David Howells (dhowells@redhat.com) - * - * This program is free software; you can redistribute it and/or - * modify it under the terms of the GNU General Public Licence - * as published by the Free Software Foundation; either version - * 2 of the Licence, or (at your option) any later version. - */ - -/* - * user.c - */ -static inline void sched_switch_user(struct task_struct *p) -{ -#ifdef CONFIG_USER_SCHED - sched_move_task(p); -#endif /* CONFIG_USER_SCHED */ -} - diff --git a/kernel/cred.c b/kernel/cred.c index 62af1816c23..8f3672a58a1 100644 --- a/kernel/cred.c +++ b/kernel/cred.c @@ -17,7 +17,6 @@ #include <linux/init_task.h> #include <linux/security.h> #include <linux/cn_proc.h> -#include "cred-internals.h" #if 0 #define kdebug(FMT, ...) \ @@ -560,8 +559,6 @@ int commit_creds(struct cred *new) atomic_dec(&old->user->processes); alter_cred_subscribers(old, -2); - sched_switch_user(task); - /* send notifications */ if (new->uid != old->uid || new->euid != old->euid || diff --git a/kernel/exit.c b/kernel/exit.c index 7f2683a10ac..eabca5a73a8 100644 --- a/kernel/exit.c +++ b/kernel/exit.c @@ -55,7 +55,6 @@ #include <asm/unistd.h> #include <asm/pgtable.h> #include <asm/mmu_context.h> -#include "cred-internals.h" static void exit_mm(struct task_struct * tsk); diff --git a/kernel/fork.c b/kernel/fork.c index 4c14942a0ee..4d57d9e3a6e 100644 --- a/kernel/fork.c +++ b/kernel/fork.c @@ -1112,8 +1112,6 @@ static struct task_struct *copy_process(unsigned long clone_flags, p->memcg_batch.memcg = NULL; #endif - p->bts = NULL; - /* Perform scheduler related setup. Assign this task to a CPU. */ sched_fork(p, clone_flags); diff --git a/kernel/hw_breakpoint.c b/kernel/hw_breakpoint.c index 03808ed342a..7a56b22e060 100644 --- a/kernel/hw_breakpoint.c +++ b/kernel/hw_breakpoint.c @@ -40,23 +40,29 @@ #include <linux/percpu.h> #include <linux/sched.h> #include <linux/init.h> +#include <linux/slab.h> #include <linux/cpu.h> #include <linux/smp.h> #include <linux/hw_breakpoint.h> + /* * Constraints data */ /* Number of pinned cpu breakpoints in a cpu */ -static DEFINE_PER_CPU(unsigned int, nr_cpu_bp_pinned); +static DEFINE_PER_CPU(unsigned int, nr_cpu_bp_pinned[TYPE_MAX]); /* Number of pinned task breakpoints in a cpu */ -static DEFINE_PER_CPU(unsigned int, nr_task_bp_pinned[HBP_NUM]); +static DEFINE_PER_CPU(unsigned int *, nr_task_bp_pinned[TYPE_MAX]); /* Number of non-pinned cpu/task breakpoints in a cpu */ -static DEFINE_PER_CPU(unsigned int, nr_bp_flexible); +static DEFINE_PER_CPU(unsigned int, nr_bp_flexible[TYPE_MAX]); + +static int nr_slots[TYPE_MAX]; + +static int constraints_initialized; /* Gather the number of total pinned and un-pinned bp in a cpuset */ struct bp_busy_slots { @@ -67,16 +73,29 @@ struct bp_busy_slots { /* Serialize accesses to the above constraints */ static DEFINE_MUTEX(nr_bp_mutex); +__weak int hw_breakpoint_weight(struct perf_event *bp) +{ + return 1; +} + +static inline enum bp_type_idx find_slot_idx(struct perf_event *bp) +{ + if (bp->attr.bp_type & HW_BREAKPOINT_RW) + return TYPE_DATA; + + return TYPE_INST; +} + /* * Report the maximum number of pinned breakpoints a task * have in this cpu */ -static unsigned int max_task_bp_pinned(int cpu) +static unsigned int max_task_bp_pinned(int cpu, enum bp_type_idx type) { int i; - unsigned int *tsk_pinned = per_cpu(nr_task_bp_pinned, cpu); + unsigned int *tsk_pinned = per_cpu(nr_task_bp_pinned[type], cpu); - for (i = HBP_NUM -1; i >= 0; i--) { + for (i = nr_slots[type] - 1; i >= 0; i--) { if (tsk_pinned[i] > 0) return i + 1; } @@ -84,7 +103,7 @@ static unsigned int max_task_bp_pinned(int cpu) return 0; } -static int task_bp_pinned(struct task_struct *tsk) +static int task_bp_pinned(struct task_struct *tsk, enum bp_type_idx type) { struct perf_event_context *ctx = tsk->perf_event_ctxp; struct list_head *list; @@ -105,7 +124,8 @@ static int task_bp_pinned(struct task_struct *tsk) */ list_for_each_entry(bp, list, event_entry) { if (bp->attr.type == PERF_TYPE_BREAKPOINT) - count++; + if (find_slot_idx(bp) == type) + count += hw_breakpoint_weight(bp); } raw_spin_unlock_irqrestore(&ctx->lock, flags); @@ -118,18 +138,19 @@ static int task_bp_pinned(struct task_struct *tsk) * a given cpu (cpu > -1) or in all of them (cpu = -1). */ static void -fetch_bp_busy_slots(struct bp_busy_slots *slots, struct perf_event *bp) +fetch_bp_busy_slots(struct bp_busy_slots *slots, struct perf_event *bp, + enum bp_type_idx type) { int cpu = bp->cpu; struct task_struct *tsk = bp->ctx->task; if (cpu >= 0) { - slots->pinned = per_cpu(nr_cpu_bp_pinned, cpu); + slots->pinned = per_cpu(nr_cpu_bp_pinned[type], cpu); if (!tsk) - slots->pinned += max_task_bp_pinned(cpu); + slots->pinned += max_task_bp_pinned(cpu, type); else - slots->pinned += task_bp_pinned(tsk); - slots->flexible = per_cpu(nr_bp_flexible, cpu); + slots->pinned += task_bp_pinned(tsk, type); + slots->flexible = per_cpu(nr_bp_flexible[type], cpu); return; } @@ -137,16 +158,16 @@ fetch_bp_busy_slots(struct bp_busy_slots *slots, struct perf_event *bp) for_each_online_cpu(cpu) { unsigned int nr; - nr = per_cpu(nr_cpu_bp_pinned, cpu); + nr = per_cpu(nr_cpu_bp_pinned[type], cpu); if (!tsk) - nr += max_task_bp_pinned(cpu); + nr += max_task_bp_pinned(cpu, type); else - nr += task_bp_pinned(tsk); + nr += task_bp_pinned(tsk, type); if (nr > slots->pinned) slots->pinned = nr; - nr = per_cpu(nr_bp_flexible, cpu); + nr = per_cpu(nr_bp_flexible[type], cpu); if (nr > slots->flexible) slots->flexible = nr; @@ -154,31 +175,49 @@ fetch_bp_busy_slots(struct bp_busy_slots *slots, struct perf_event *bp) } /* + * For now, continue to consider flexible as pinned, until we can + * ensure no flexible event can ever be scheduled before a pinned event + * in a same cpu. + */ +static void +fetch_this_slot(struct bp_busy_slots *slots, int weight) +{ + slots->pinned += weight; +} + +/* * Add a pinned breakpoint for the given task in our constraint table */ -static void toggle_bp_task_slot(struct task_struct *tsk, int cpu, bool enable) +static void toggle_bp_task_slot(struct task_struct *tsk, int cpu, bool enable, + enum bp_type_idx type, int weight) { unsigned int *tsk_pinned; - int count = 0; + int old_count = 0; + int old_idx = 0; + int idx = 0; - count = task_bp_pinned(tsk); + old_count = task_bp_pinned(tsk, type); + old_idx = old_count - 1; + idx = old_idx + weight; - tsk_pinned = per_cpu(nr_task_bp_pinned, cpu); + tsk_pinned = per_cpu(nr_task_bp_pinned[type], cpu); if (enable) { - tsk_pinned[count]++; - if (count > 0) - tsk_pinned[count-1]--; + tsk_pinned[idx]++; + if (old_count > 0) + tsk_pinned[old_idx]--; } else { - tsk_pinned[count]--; - if (count > 0) - tsk_pinned[count-1]++; + tsk_pinned[idx]--; + if (old_count > 0) + tsk_pinned[old_idx]++; } } /* * Add/remove the given breakpoint in our constraint table */ -static void toggle_bp_slot(struct perf_event *bp, bool enable) +static void +toggle_bp_slot(struct perf_event *bp, bool enable, enum bp_type_idx type, + int weight) { int cpu = bp->cpu; struct task_struct *tsk = bp->ctx->task; @@ -186,20 +225,20 @@ static void toggle_bp_slot(struct perf_event *bp, bool enable) /* Pinned counter task profiling */ if (tsk) { if (cpu >= 0) { - toggle_bp_task_slot(tsk, cpu, enable); + toggle_bp_task_slot(tsk, cpu, enable, type, weight); return; } for_each_online_cpu(cpu) - toggle_bp_task_slot(tsk, cpu, enable); + toggle_bp_task_slot(tsk, cpu, enable, type, weight); return; } /* Pinned counter cpu profiling */ if (enable) - per_cpu(nr_cpu_bp_pinned, bp->cpu)++; + per_cpu(nr_cpu_bp_pinned[type], bp->cpu) += weight; else - per_cpu(nr_cpu_bp_pinned, bp->cpu)--; + per_cpu(nr_cpu_bp_pinned[type], bp->cpu) -= weight; } /* @@ -246,14 +285,29 @@ static void toggle_bp_slot(struct perf_event *bp, bool enable) static int __reserve_bp_slot(struct perf_event *bp) { struct bp_busy_slots slots = {0}; + enum bp_type_idx type; + int weight; - fetch_bp_busy_slots(&slots, bp); + /* We couldn't initialize breakpoint constraints on boot */ + if (!constraints_initialized) + return -ENOMEM; + + /* Basic checks */ + if (bp->attr.bp_type == HW_BREAKPOINT_EMPTY || + bp->attr.bp_type == HW_BREAKPOINT_INVALID) + return -EINVAL; + + type = find_slot_idx(bp); + weight = hw_breakpoint_weight(bp); + + fetch_bp_busy_slots(&slots, bp, type); + fetch_this_slot(&slots, weight); /* Flexible counters need to keep at least one slot */ - if (slots.pinned + (!!slots.flexible) == HBP_NUM) + if (slots.pinned + (!!slots.flexible) > nr_slots[type]) return -ENOSPC; - toggle_bp_slot(bp, true); + toggle_bp_slot(bp, true, type, weight); return 0; } @@ -273,7 +327,12 @@ int reserve_bp_slot(struct perf_event *bp) static void __release_bp_slot(struct perf_event *bp) { - toggle_bp_slot(bp, false); + enum bp_type_idx type; + int weight; + + type = find_slot_idx(bp); + weight = hw_breakpoint_weight(bp); + toggle_bp_slot(bp, false, type, weight); } void release_bp_slot(struct perf_event *bp) @@ -308,6 +367,28 @@ int dbg_release_bp_slot(struct perf_event *bp) return 0; } +static int validate_hw_breakpoint(struct perf_event *bp) +{ + int ret; + + ret = arch_validate_hwbkpt_settings(bp); + if (ret) + return ret; + + if (arch_check_bp_in_kernelspace(bp)) { + if (bp->attr.exclude_kernel) + return -EINVAL; + /* + * Don't let unprivileged users set a breakpoint in the trap + * path to avoid trap recursion attacks. + */ + if (!capable(CAP_SYS_ADMIN)) + return -EPERM; + } + + return 0; +} + int register_perf_hw_breakpoint(struct perf_event *bp) { int ret; @@ -316,17 +397,7 @@ int register_perf_hw_breakpoint(struct perf_event *bp) if (ret) return ret; - /* - * Ptrace breakpoints can be temporary perf events only - * meant to reserve a slot. In this case, it is created disabled and - * we don't want to check the params right now (as we put a null addr) - * But perf tools create events as disabled and we want to check - * the params for them. - * This is a quick hack that will be removed soon, once we remove - * the tmp breakpoints from ptrace - */ - if (!bp->attr.disabled || !bp->overflow_handler) - ret = arch_validate_hwbkpt_settings(bp, bp->ctx->task); + ret = validate_hw_breakpoint(bp); /* if arch_validate_hwbkpt_settings() fails then release bp slot */ if (ret) @@ -373,7 +444,7 @@ int modify_user_hw_breakpoint(struct perf_event *bp, struct perf_event_attr *att if (attr->disabled) goto end; - err = arch_validate_hwbkpt_settings(bp, bp->ctx->task); + err = validate_hw_breakpoint(bp); if (!err) perf_event_enable(bp); @@ -480,7 +551,36 @@ static struct notifier_block hw_breakpoint_exceptions_nb = { static int __init init_hw_breakpoint(void) { + unsigned int **task_bp_pinned; + int cpu, err_cpu; + int i; + + for (i = 0; i < TYPE_MAX; i++) + nr_slots[i] = hw_breakpoint_slots(i); + + for_each_possible_cpu(cpu) { + for (i = 0; i < TYPE_MAX; i++) { + task_bp_pinned = &pe |