diff options
Diffstat (limited to 'kernel')
57 files changed, 2570 insertions, 2543 deletions
diff --git a/kernel/Kconfig.locks b/kernel/Kconfig.locks index 44511d100ea..d2b32ac27a3 100644 --- a/kernel/Kconfig.locks +++ b/kernel/Kconfig.locks @@ -138,7 +138,7 @@ config INLINE_SPIN_UNLOCK_BH config INLINE_SPIN_UNLOCK_IRQ def_bool y - depends on !PREEMPT || ARCH_INLINE_SPIN_UNLOCK_BH + depends on !PREEMPT || ARCH_INLINE_SPIN_UNLOCK_IRQ config INLINE_SPIN_UNLOCK_IRQRESTORE def_bool y @@ -175,7 +175,7 @@ config INLINE_READ_UNLOCK_BH config INLINE_READ_UNLOCK_IRQ def_bool y - depends on !PREEMPT || ARCH_INLINE_READ_UNLOCK_BH + depends on !PREEMPT || ARCH_INLINE_READ_UNLOCK_IRQ config INLINE_READ_UNLOCK_IRQRESTORE def_bool y @@ -212,7 +212,7 @@ config INLINE_WRITE_UNLOCK_BH config INLINE_WRITE_UNLOCK_IRQ def_bool y - depends on !PREEMPT || ARCH_INLINE_WRITE_UNLOCK_BH + depends on !PREEMPT || ARCH_INLINE_WRITE_UNLOCK_IRQ config INLINE_WRITE_UNLOCK_IRQRESTORE def_bool y diff --git a/kernel/audit.c b/kernel/audit.c index 21c7fa615bd..91e53d04b6a 100644 --- a/kernel/audit.c +++ b/kernel/audit.c @@ -1056,7 +1056,7 @@ static inline void audit_get_stamp(struct audit_context *ctx, static void wait_for_auditd(unsigned long sleep_time) { DECLARE_WAITQUEUE(wait, current); - set_current_state(TASK_INTERRUPTIBLE); + set_current_state(TASK_UNINTERRUPTIBLE); add_wait_queue(&audit_backlog_wait, &wait); if (audit_backlog_limit && diff --git a/kernel/audit_tree.c b/kernel/audit_tree.c index a291aa23fb3..43c307dc945 100644 --- a/kernel/audit_tree.c +++ b/kernel/audit_tree.c @@ -658,6 +658,7 @@ int audit_add_tree_rule(struct audit_krule *rule) struct vfsmount *mnt; int err; + rule->tree = NULL; list_for_each_entry(tree, &tree_list, list) { if (!strcmp(seed->pathname, tree->pathname)) { put_tree(seed); diff --git a/kernel/cgroup.c b/kernel/cgroup.c index 2a9926275f8..a7c9e6ddb97 100644 --- a/kernel/cgroup.c +++ b/kernel/cgroup.c @@ -1686,11 +1686,14 @@ static struct dentry *cgroup_mount(struct file_system_type *fs_type, */ cgroup_drop_root(opts.new_root); - if (((root->flags | opts.flags) & CGRP_ROOT_SANE_BEHAVIOR) && - root->flags != opts.flags) { - pr_err("cgroup: sane_behavior: new mount options should match the existing superblock\n"); - ret = -EINVAL; - goto drop_new_super; + if (root->flags != opts.flags) { + if ((root->flags | opts.flags) & CGRP_ROOT_SANE_BEHAVIOR) { + pr_err("cgroup: sane_behavior: new mount options should match the existing superblock\n"); + ret = -EINVAL; + goto drop_new_super; + } else { + pr_warning("cgroup: new mount options do not match the existing superblock, will be ignored\n"); + } } /* no subsys rebinding, so refcounts don't change */ @@ -2699,13 +2702,14 @@ static int cgroup_add_file(struct cgroup *cgrp, struct cgroup_subsys *subsys, goto out; } + cfe->type = (void *)cft; + cfe->dentry = dentry; + dentry->d_fsdata = cfe; + simple_xattrs_init(&cfe->xattrs); + mode = cgroup_file_mode(cft); error = cgroup_create_file(dentry, mode | S_IFREG, cgrp->root->sb); if (!error) { - cfe->type = (void *)cft; - cfe->dentry = dentry; - dentry->d_fsdata = cfe; - simple_xattrs_init(&cfe->xattrs); list_add_tail(&cfe->node, &parent->files); cfe = NULL; } @@ -2953,11 +2957,8 @@ struct cgroup *cgroup_next_descendant_pre(struct cgroup *pos, WARN_ON_ONCE(!rcu_read_lock_held()); /* if first iteration, pretend we just visited @cgroup */ - if (!pos) { - if (list_empty(&cgroup->children)) - return NULL; + if (!pos) pos = cgroup; - } /* visit the first child if exists */ next = list_first_or_null_rcu(&pos->children, struct cgroup, sibling); @@ -2965,14 +2966,14 @@ struct cgroup *cgroup_next_descendant_pre(struct cgroup *pos, return next; /* no child, visit my or the closest ancestor's next sibling */ - do { + while (pos != cgroup) { next = list_entry_rcu(pos->sibling.next, struct cgroup, sibling); if (&next->sibling != &pos->parent->children) return next; pos = pos->parent; - } while (pos != cgroup); + } return NULL; } diff --git a/kernel/context_tracking.c b/kernel/context_tracking.c index 65349f07b87..383f8231e43 100644 --- a/kernel/context_tracking.c +++ b/kernel/context_tracking.c @@ -15,7 +15,6 @@ */ #include <linux/context_tracking.h> -#include <linux/kvm_host.h> #include <linux/rcupdate.h> #include <linux/sched.h> #include <linux/hardirq.h> @@ -71,6 +70,46 @@ void user_enter(void) local_irq_restore(flags); } +#ifdef CONFIG_PREEMPT +/** + * preempt_schedule_context - preempt_schedule called by tracing + * + * The tracing infrastructure uses preempt_enable_notrace to prevent + * recursion and tracing preempt enabling caused by the tracing + * infrastructure itself. But as tracing can happen in areas coming + * from userspace or just about to enter userspace, a preempt enable + * can occur before user_exit() is called. This will cause the scheduler + * to be called when the system is still in usermode. + * + * To prevent this, the preempt_enable_notrace will use this function + * instead of preempt_schedule() to exit user context if needed before + * calling the scheduler. + */ +void __sched notrace preempt_schedule_context(void) +{ + struct thread_info *ti = current_thread_info(); + enum ctx_state prev_ctx; + + if (likely(ti->preempt_count || irqs_disabled())) + return; + + /* + * Need to disable preemption in case user_exit() is traced + * and the tracer calls preempt_enable_notrace() causing + * an infinite recursion. + */ + preempt_disable_notrace(); + prev_ctx = exception_enter(); + preempt_enable_no_resched_notrace(); + + preempt_schedule(); + + preempt_disable_notrace(); + exception_exit(prev_ctx); + preempt_enable_notrace(); +} +EXPORT_SYMBOL_GPL(preempt_schedule_context); +#endif /* CONFIG_PREEMPT */ /** * user_exit - Inform the context tracking that the CPU is diff --git a/kernel/cpu.c b/kernel/cpu.c index b5e4ab2d427..198a38883e6 100644 --- a/kernel/cpu.c +++ b/kernel/cpu.c @@ -133,6 +133,27 @@ static void cpu_hotplug_done(void) mutex_unlock(&cpu_hotplug.lock); } +/* + * Wait for currently running CPU hotplug operations to complete (if any) and + * disable future CPU hotplug (from sysfs). The 'cpu_add_remove_lock' protects + * the 'cpu_hotplug_disabled' flag. The same lock is also acquired by the + * hotplug path before performing hotplug operations. So acquiring that lock + * guarantees mutual exclusion from any currently running hotplug operations. + */ +void cpu_hotplug_disable(void) +{ + cpu_maps_update_begin(); + cpu_hotplug_disabled = 1; + cpu_maps_update_done(); +} + +void cpu_hotplug_enable(void) +{ + cpu_maps_update_begin(); + cpu_hotplug_disabled = 0; + cpu_maps_update_done(); +} + #else /* #if CONFIG_HOTPLUG_CPU */ static void cpu_hotplug_begin(void) {} static void cpu_hotplug_done(void) {} @@ -541,36 +562,6 @@ static int __init alloc_frozen_cpus(void) core_initcall(alloc_frozen_cpus); /* - * Prevent regular CPU hotplug from racing with the freezer, by disabling CPU - * hotplug when tasks are about to be frozen. Also, don't allow the freezer - * to continue until any currently running CPU hotplug operation gets - * completed. - * To modify the 'cpu_hotplug_disabled' flag, we need to acquire the - * 'cpu_add_remove_lock'. And this same lock is also taken by the regular - * CPU hotplug path and released only after it is complete. Thus, we - * (and hence the freezer) will block here until any currently running CPU - * hotplug operation gets completed. - */ -void cpu_hotplug_disable_before_freeze(void) -{ - cpu_maps_update_begin(); - cpu_hotplug_disabled = 1; - cpu_maps_update_done(); -} - - -/* - * When tasks have been thawed, re-enable regular CPU hotplug (which had been - * disabled while beginning to freeze tasks). - */ -void cpu_hotplug_enable_after_thaw(void) -{ - cpu_maps_update_begin(); - cpu_hotplug_disabled = 0; - cpu_maps_update_done(); -} - -/* * When callbacks for CPU hotplug notifications are being executed, we must * ensure that the state of the system with respect to the tasks being frozen * or not, as reported by the notification, remains unchanged *throughout the @@ -589,12 +580,12 @@ cpu_hotplug_pm_callback(struct notifier_block *nb, case PM_SUSPEND_PREPARE: case PM_HIBERNATION_PREPARE: - cpu_hotplug_disable_before_freeze(); + cpu_hotplug_disable(); break; case PM_POST_SUSPEND: case PM_POST_HIBERNATION: - cpu_hotplug_enable_after_thaw(); + cpu_hotplug_enable(); break; default: diff --git a/kernel/cpu/idle.c b/kernel/cpu/idle.c index d5585f5e038..e695c0a0bcb 100644 --- a/kernel/cpu/idle.c +++ b/kernel/cpu/idle.c @@ -5,6 +5,7 @@ #include <linux/cpu.h> #include <linux/tick.h> #include <linux/mm.h> +#include <linux/stackprotector.h> #include <asm/tlb.h> @@ -58,6 +59,7 @@ void __weak arch_cpu_idle_dead(void) { } void __weak arch_cpu_idle(void) { cpu_idle_force_poll = 1; + local_irq_enable(); } /* @@ -112,6 +114,21 @@ static void cpu_idle_loop(void) void cpu_startup_entry(enum cpuhp_state state) { + /* + * This #ifdef needs to die, but it's too late in the cycle to + * make this generic (arm and sh have never invoked the canary + * init for the non boot cpus!). Will be fixed in 3.11 + */ +#ifdef CONFIG_X86 + /* + * If we're the non-boot CPU, nothing set the stack canary up + * for us. The boot CPU already has it initialized but no harm + * in doing it again. This is a good place for updating it, as + * we wont ever return from this function (so the invalid + * canaries already on the stack wont ever trigger). + */ + boot_init_stack_canary(); +#endif current_set_polling(); arch_cpu_idle_prepare(); cpu_idle_loop(); diff --git a/kernel/cpuset.c b/kernel/cpuset.c index 64b3f791bbe..902d13fc2b1 100644 --- a/kernel/cpuset.c +++ b/kernel/cpuset.c @@ -540,7 +540,7 @@ static void update_domain_attr_tree(struct sched_domain_attr *dattr, * This function builds a partial partition of the systems CPUs * A 'partial partition' is a set of non-overlapping subsets whose * union is a subset of that set. - * The output of this function needs to be passed to kernel/sched.c + * The output of this function needs to be passed to kernel/sched/core.c * partition_sched_domains() routine, which will rebuild the scheduler's * load balancing domains (sched domains) as specified by that partial * partition. @@ -569,7 +569,7 @@ static void update_domain_attr_tree(struct sched_domain_attr *dattr, * is a subset of one of these domains, while there are as * many such domains as possible, each as small as possible. * doms - Conversion of 'csa' to an array of cpumasks, for passing to - * the kernel/sched.c routine partition_sched_domains() in a + * the kernel/sched/core.c routine partition_sched_domains() in a * convenient format, that can be easily compared to the prior * value to determine what partition elements (sched domains) * were changed (added or removed.) diff --git a/kernel/events/core.c b/kernel/events/core.c index 9dc297faf7c..1db3af93370 100644 --- a/kernel/events/core.c +++ b/kernel/events/core.c @@ -165,10 +165,28 @@ int sysctl_perf_event_mlock __read_mostly = 512 + (PAGE_SIZE / 1024); /* 'free' /* * max perf event sample rate */ -#define DEFAULT_MAX_SAMPLE_RATE 100000 -int sysctl_perf_event_sample_rate __read_mostly = DEFAULT_MAX_SAMPLE_RATE; -static int max_samples_per_tick __read_mostly = - DIV_ROUND_UP(DEFAULT_MAX_SAMPLE_RATE, HZ); +#define DEFAULT_MAX_SAMPLE_RATE 100000 +#define DEFAULT_SAMPLE_PERIOD_NS (NSEC_PER_SEC / DEFAULT_MAX_SAMPLE_RATE) +#define DEFAULT_CPU_TIME_MAX_PERCENT 25 + +int sysctl_perf_event_sample_rate __read_mostly = DEFAULT_MAX_SAMPLE_RATE; + +static int max_samples_per_tick __read_mostly = DIV_ROUND_UP(DEFAULT_MAX_SAMPLE_RATE, HZ); +static int perf_sample_period_ns __read_mostly = DEFAULT_SAMPLE_PERIOD_NS; + +static atomic_t perf_sample_allowed_ns __read_mostly = + ATOMIC_INIT( DEFAULT_SAMPLE_PERIOD_NS * DEFAULT_CPU_TIME_MAX_PERCENT / 100); + +void update_perf_cpu_limits(void) +{ + u64 tmp = perf_sample_period_ns; + + tmp *= sysctl_perf_cpu_time_max_percent; + tmp = do_div(tmp, 100); + atomic_set(&perf_sample_allowed_ns, tmp); +} + +static int perf_rotate_context(struct perf_cpu_context *cpuctx); int perf_proc_update_handler(struct ctl_table *table, int write, void __user *buffer, size_t *lenp, @@ -180,10 +198,78 @@ int perf_proc_update_handler(struct ctl_table *table, int write, return ret; max_samples_per_tick = DIV_ROUND_UP(sysctl_perf_event_sample_rate, HZ); + perf_sample_period_ns = NSEC_PER_SEC / sysctl_perf_event_sample_rate; + update_perf_cpu_limits(); return 0; } +int sysctl_perf_cpu_time_max_percent __read_mostly = DEFAULT_CPU_TIME_MAX_PERCENT; + +int perf_cpu_time_max_percent_handler(struct ctl_table *table, int write, + void __user *buffer, size_t *lenp, + loff_t *ppos) +{ + int ret = proc_dointvec(table, write, buffer, lenp, ppos); + + if (ret || !write) + return ret; + + update_perf_cpu_limits(); + + return 0; +} + +/* + * perf samples are done in some very critical code paths (NMIs). + * If they take too much CPU time, the system can lock up and not + * get any real work done. This will drop the sample rate when + * we detect that events are taking too long. + */ +#define NR_ACCUMULATED_SAMPLES 128 +DEFINE_PER_CPU(u64, running_sample_length); + +void perf_sample_event_took(u64 sample_len_ns) +{ + u64 avg_local_sample_len; + u64 local_samples_len = __get_cpu_var(running_sample_length); + + if (atomic_read(&perf_sample_allowed_ns) == 0) + return; + + /* decay the counter by 1 average sample */ + local_samples_len = __get_cpu_var(running_sample_length); + local_samples_len -= local_samples_len/NR_ACCUMULATED_SAMPLES; + local_samples_len += sample_len_ns; + __get_cpu_var(running_sample_length) = local_samples_len; + + /* + * note: this will be biased artifically low until we have + * seen NR_ACCUMULATED_SAMPLES. Doing it this way keeps us + * from having to maintain a count. + */ + avg_local_sample_len = local_samples_len/NR_ACCUMULATED_SAMPLES; + + if (avg_local_sample_len <= atomic_read(&perf_sample_allowed_ns)) + return; + + if (max_samples_per_tick <= 1) + return; + + max_samples_per_tick = DIV_ROUND_UP(max_samples_per_tick, 2); + sysctl_perf_event_sample_rate = max_samples_per_tick * HZ; + perf_sample_period_ns = NSEC_PER_SEC / sysctl_perf_event_sample_rate; + + printk_ratelimited(KERN_WARNING + "perf samples too long (%lld > %d), lowering " + "kernel.perf_event_max_sample_rate to %d\n", + avg_local_sample_len, + atomic_read(&perf_sample_allowed_ns), + sysctl_perf_event_sample_rate); + + update_perf_cpu_limits(); +} + static atomic64_t perf_event_id; static void cpu_ctx_sched_out(struct perf_cpu_context *cpuctx, @@ -196,9 +282,6 @@ static void cpu_ctx_sched_in(struct perf_cpu_context *cpuctx, static void update_context_time(struct perf_event_context *ctx); static u64 perf_event_time(struct perf_event *event); -static void ring_buffer_attach(struct perf_event *event, - struct ring_buffer *rb); - void __weak perf_event_print_debug(void) { } extern __weak const char *perf_pmu_name(void) @@ -658,6 +741,106 @@ perf_cgroup_mark_enabled(struct perf_event *event, } #endif +/* + * set default to be dependent on timer tick just + * like original code + */ +#define PERF_CPU_HRTIMER (1000 / HZ) +/* + * function must be called with interrupts disbled + */ +static enum hrtimer_restart perf_cpu_hrtimer_handler(struct hrtimer *hr) +{ + struct perf_cpu_context *cpuctx; + enum hrtimer_restart ret = HRTIMER_NORESTART; + int rotations = 0; + + WARN_ON(!irqs_disabled()); + + cpuctx = container_of(hr, struct perf_cpu_context, hrtimer); + + rotations = perf_rotate_context(cpuctx); + + /* + * arm timer if needed + */ + if (rotations) { + hrtimer_forward_now(hr, cpuctx->hrtimer_interval); + ret = HRTIMER_RESTART; + } + + return ret; +} + +/* CPU is going down */ +void perf_cpu_hrtimer_cancel(int cpu) +{ + struct perf_cpu_context *cpuctx; + struct pmu *pmu; + unsigned long flags; + + if (WARN_ON(cpu != smp_processor_id())) + return; + + local_irq_save(flags); + + rcu_read_lock(); + + list_for_each_entry_rcu(pmu, &pmus, entry) { + cpuctx = this_cpu_ptr(pmu->pmu_cpu_context); + + if (pmu->task_ctx_nr == perf_sw_context) + continue; + + hrtimer_cancel(&cpuctx->hrtimer); + } + + rcu_read_unlock(); + + local_irq_restore(flags); +} + +static void __perf_cpu_hrtimer_init(struct perf_cpu_context *cpuctx, int cpu) +{ + struct hrtimer *hr = &cpuctx->hrtimer; + struct pmu *pmu = cpuctx->ctx.pmu; + int timer; + + /* no multiplexing needed for SW PMU */ + if (pmu->task_ctx_nr == perf_sw_context) + return; + + /* + * check default is sane, if not set then force to + * default interval (1/tick) + */ + timer = pmu->hrtimer_interval_ms; + if (timer < 1) + timer = pmu->hrtimer_interval_ms = PERF_CPU_HRTIMER; + + cpuctx->hrtimer_interval = ns_to_ktime(NSEC_PER_MSEC * timer); + + hrtimer_init(hr, CLOCK_MONOTONIC, HRTIMER_MODE_REL_PINNED); + hr->function = perf_cpu_hrtimer_handler; +} + +static void perf_cpu_hrtimer_restart(struct perf_cpu_context *cpuctx) +{ + struct hrtimer *hr = &cpuctx->hrtimer; + struct pmu *pmu = cpuctx->ctx.pmu; + + /* not for SW PMU */ + if (pmu->task_ctx_nr == perf_sw_context) + return; + + if (hrtimer_active(hr)) + return; + + if (!hrtimer_callback_running(hr)) + __hrtimer_start_range_ns(hr, cpuctx->hrtimer_interval, + 0, HRTIMER_MODE_REL_PINNED, 0); +} + void perf_pmu_disable(struct pmu *pmu) { int *count = this_cpu_ptr(pmu->pmu_disable_count); @@ -1506,6 +1689,7 @@ group_sched_in(struct perf_event *group_event, if (event_sched_in(group_event, cpuctx, ctx)) { pmu->cancel_txn(pmu); + perf_cpu_hrtimer_restart(cpuctx); return -EAGAIN; } @@ -1552,6 +1736,8 @@ group_error: pmu->cancel_txn(pmu); + perf_cpu_hrtimer_restart(cpuctx); + return -EAGAIN; } @@ -1807,8 +1993,10 @@ static int __perf_event_enable(void *info) * If this event can't go on and it's part of a * group, then the whole group has to come off. */ - if (leader != event) + if (leader != event) { group_sched_out(leader, cpuctx, ctx); + perf_cpu_hrtimer_restart(cpuctx); + } if (leader->attr.pinned) { update_group_times(leader); leader->state = PERF_EVENT_STATE_ERROR; @@ -2555,7 +2743,7 @@ static void rotate_ctx(struct perf_event_context *ctx) * because they're strictly cpu affine and rotate_start is called with IRQs * disabled, while rotate_context is called from IRQ context. */ -static void perf_rotate_context(struct perf_cpu_context *cpuctx) +static int perf_rotate_context(struct perf_cpu_context *cpuctx) { struct perf_event_context *ctx = NULL; int rotate = 0, remove = 1; @@ -2594,6 +2782,8 @@ static void perf_rotate_context(struct perf_cpu_context *cpuctx) done: if (remove) list_del_init(&cpuctx->rotation_list); + + return rotate; } #ifdef CONFIG_NO_HZ_FULL @@ -2625,10 +2815,6 @@ void perf_event_task_tick(void) ctx = cpuctx->task_ctx; if (ctx) perf_adjust_freq_unthr_context(ctx, throttled); - - if (cpuctx->jiffies_interval == 1 || - !(jiffies % cpuctx->jiffies_interval)) - perf_rotate_context(cpuctx); } } @@ -2918,6 +3104,7 @@ static void free_event_rcu(struct rcu_head *head) } static void ring_buffer_put(struct ring_buffer *rb); +static void ring_buffer_detach(struct perf_event *event, struct ring_buffer *rb); static void free_event(struct perf_event *event) { @@ -2942,15 +3129,30 @@ static void free_event(struct perf_event *event) if (has_branch_stack(event)) { static_key_slow_dec_deferred(&perf_sched_events); /* is system-wide event */ - if (!(event->attach_state & PERF_ATTACH_TASK)) + if (!(event- |