Diffstat (limited to 'kernel')
-rw-r--r--  kernel/audit.c              |  67
-rw-r--r--  kernel/audit.h              |   5
-rw-r--r--  kernel/audit_tree.c         |   9
-rw-r--r--  kernel/audit_watch.c        |   4
-rw-r--r--  kernel/auditfilter.c        |  12
-rw-r--r--  kernel/auditsc.c            |  16
-rw-r--r--  kernel/cgroup.c             |  11
-rw-r--r--  kernel/cpuset.c             |  13
-rw-r--r--  kernel/debug/debug_core.c   |  16
-rw-r--r--  kernel/debug/kdb/kdb_main.c |  48
-rw-r--r--  kernel/exit.c               |   8
-rw-r--r--  kernel/irq/manage.c         |   4
-rw-r--r--  kernel/latencytop.c         |  17
-rw-r--r--  kernel/module.c             |  12
-rw-r--r--  kernel/perf_event.c         |  42
-rw-r--r--  kernel/printk.c             |  21
-rw-r--r--  kernel/range.c              |   2
-rw-r--r--  kernel/relay.c              |  15
-rw-r--r--  kernel/resource.c           | 151
-rw-r--r--  kernel/sched.c              |   8
-rw-r--r--  kernel/sched_fair.c         |  25
-rw-r--r--  kernel/sched_stats.h        |  20
-rw-r--r--  kernel/sysctl.c             |   9
-rw-r--r--  kernel/trace/blktrace.c     |   4
-rw-r--r--  kernel/trace/trace.c        |  19
-rw-r--r--  kernel/watchdog.c           |   2
26 files changed, 373 insertions, 187 deletions
diff --git a/kernel/audit.c b/kernel/audit.c index d96045789b5..77770a034d5 100644 --- a/kernel/audit.c +++ b/kernel/audit.c @@ -467,23 +467,16 @@ static int audit_prepare_user_tty(pid_t pid, uid_t loginuid, u32 sessionid) struct task_struct *tsk; int err; - read_lock(&tasklist_lock); + rcu_read_lock(); tsk = find_task_by_vpid(pid); - err = -ESRCH; - if (!tsk) - goto out; - err = 0; - - spin_lock_irq(&tsk->sighand->siglock); - if (!tsk->signal->audit_tty) - err = -EPERM; - spin_unlock_irq(&tsk->sighand->siglock); - if (err) - goto out; - - tty_audit_push_task(tsk, loginuid, sessionid); -out: - read_unlock(&tasklist_lock); + if (!tsk) { + rcu_read_unlock(); + return -ESRCH; + } + get_task_struct(tsk); + rcu_read_unlock(); + err = tty_audit_push_task(tsk, loginuid, sessionid); + put_task_struct(tsk); return err; } @@ -506,7 +499,7 @@ int audit_send_list(void *_dest) } struct sk_buff *audit_make_reply(int pid, int seq, int type, int done, - int multi, void *payload, int size) + int multi, const void *payload, int size) { struct sk_buff *skb; struct nlmsghdr *nlh; @@ -555,8 +548,8 @@ static int audit_send_reply_thread(void *arg) * Allocates an skb, builds the netlink message, and sends it to the pid. * No failure notifications. */ -void audit_send_reply(int pid, int seq, int type, int done, int multi, - void *payload, int size) +static void audit_send_reply(int pid, int seq, int type, int done, int multi, + const void *payload, int size) { struct sk_buff *skb; struct task_struct *tsk; @@ -880,40 +873,40 @@ static int audit_receive_msg(struct sk_buff *skb, struct nlmsghdr *nlh) case AUDIT_TTY_GET: { struct audit_tty_status s; struct task_struct *tsk; + unsigned long flags; - read_lock(&tasklist_lock); + rcu_read_lock(); tsk = find_task_by_vpid(pid); - if (!tsk) - err = -ESRCH; - else { - spin_lock_irq(&tsk->sighand->siglock); + if (tsk && lock_task_sighand(tsk, &flags)) { s.enabled = tsk->signal->audit_tty != 0; - spin_unlock_irq(&tsk->sighand->siglock); - } - read_unlock(&tasklist_lock); - audit_send_reply(NETLINK_CB(skb).pid, seq, AUDIT_TTY_GET, 0, 0, - &s, sizeof(s)); + unlock_task_sighand(tsk, &flags); + } else + err = -ESRCH; + rcu_read_unlock(); + + if (!err) + audit_send_reply(NETLINK_CB(skb).pid, seq, + AUDIT_TTY_GET, 0, 0, &s, sizeof(s)); break; } case AUDIT_TTY_SET: { struct audit_tty_status *s; struct task_struct *tsk; + unsigned long flags; if (nlh->nlmsg_len < sizeof(struct audit_tty_status)) return -EINVAL; s = data; if (s->enabled != 0 && s->enabled != 1) return -EINVAL; - read_lock(&tasklist_lock); + rcu_read_lock(); tsk = find_task_by_vpid(pid); - if (!tsk) - err = -ESRCH; - else { - spin_lock_irq(&tsk->sighand->siglock); + if (tsk && lock_task_sighand(tsk, &flags)) { tsk->signal->audit_tty = s->enabled != 0; - spin_unlock_irq(&tsk->sighand->siglock); - } - read_unlock(&tasklist_lock); + unlock_task_sighand(tsk, &flags); + } else + err = -ESRCH; + rcu_read_unlock(); break; } default: diff --git a/kernel/audit.h b/kernel/audit.h index f7206db4e13..91e7071c4d2 100644 --- a/kernel/audit.h +++ b/kernel/audit.h @@ -84,10 +84,7 @@ extern int audit_compare_dname_path(const char *dname, const char *path, int *dirlen); extern struct sk_buff * audit_make_reply(int pid, int seq, int type, int done, int multi, - void *payload, int size); -extern void audit_send_reply(int pid, int seq, int type, - int done, int multi, - void *payload, int size); + const void *payload, int size); extern void audit_panic(const char *message); struct audit_netlink_list { diff --git a/kernel/audit_tree.c 
b/kernel/audit_tree.c index 7f18d3a4527..37b2bea170c 100644 --- a/kernel/audit_tree.c +++ b/kernel/audit_tree.c @@ -223,7 +223,7 @@ static void untag_chunk(struct node *p) { struct audit_chunk *chunk = find_chunk(p); struct fsnotify_mark *entry = &chunk->mark; - struct audit_chunk *new; + struct audit_chunk *new = NULL; struct audit_tree *owner; int size = chunk->count - 1; int i, j; @@ -232,9 +232,14 @@ static void untag_chunk(struct node *p) spin_unlock(&hash_lock); + if (size) + new = alloc_chunk(size); + spin_lock(&entry->lock); if (chunk->dead || !entry->i.inode) { spin_unlock(&entry->lock); + if (new) + free_chunk(new); goto out; } @@ -255,9 +260,9 @@ static void untag_chunk(struct node *p) goto out; } - new = alloc_chunk(size); if (!new) goto Fallback; + fsnotify_duplicate_mark(&new->mark, entry); if (fsnotify_add_mark(&new->mark, new->mark.group, new->mark.i.inode, NULL, 1)) { free_chunk(new); diff --git a/kernel/audit_watch.c b/kernel/audit_watch.c index f0c9b2e7542..d2e3c786646 100644 --- a/kernel/audit_watch.c +++ b/kernel/audit_watch.c @@ -60,7 +60,7 @@ struct audit_parent { }; /* fsnotify handle. */ -struct fsnotify_group *audit_watch_group; +static struct fsnotify_group *audit_watch_group; /* fsnotify events we care about. */ #define AUDIT_FS_WATCH (FS_MOVE | FS_CREATE | FS_DELETE | FS_DELETE_SELF |\ @@ -123,7 +123,7 @@ void audit_put_watch(struct audit_watch *watch) } } -void audit_remove_watch(struct audit_watch *watch) +static void audit_remove_watch(struct audit_watch *watch) { list_del(&watch->wlist); audit_put_parent(watch->parent); diff --git a/kernel/auditfilter.c b/kernel/auditfilter.c index eb7675499fb..add2819af71 100644 --- a/kernel/auditfilter.c +++ b/kernel/auditfilter.c @@ -1252,6 +1252,18 @@ static int audit_filter_user_rules(struct netlink_skb_parms *cb, case AUDIT_LOGINUID: result = audit_comparator(cb->loginuid, f->op, f->val); break; + case AUDIT_SUBJ_USER: + case AUDIT_SUBJ_ROLE: + case AUDIT_SUBJ_TYPE: + case AUDIT_SUBJ_SEN: + case AUDIT_SUBJ_CLR: + if (f->lsm_rule) + result = security_audit_rule_match(cb->sid, + f->type, + f->op, + f->lsm_rule, + NULL); + break; } if (!result) diff --git a/kernel/auditsc.c b/kernel/auditsc.c index 1b31c130d03..f49a0318c2e 100644 --- a/kernel/auditsc.c +++ b/kernel/auditsc.c @@ -241,6 +241,10 @@ struct audit_context { pid_t pid; struct audit_cap_data cap; } capset; + struct { + int fd; + int flags; + } mmap; }; int fds[2]; @@ -1305,6 +1309,10 @@ static void show_special(struct audit_context *context, int *call_panic) audit_log_cap(ab, "cap_pp", &context->capset.cap.permitted); audit_log_cap(ab, "cap_pe", &context->capset.cap.effective); break; } + case AUDIT_MMAP: { + audit_log_format(ab, "fd=%d flags=0x%x", context->mmap.fd, + context->mmap.flags); + break; } } audit_log_end(ab); } @@ -2476,6 +2484,14 @@ void __audit_log_capset(pid_t pid, context->type = AUDIT_CAPSET; } +void __audit_mmap_fd(int fd, int flags) +{ + struct audit_context *context = current->audit_context; + context->mmap.fd = fd; + context->mmap.flags = flags; + context->type = AUDIT_MMAP; +} + /** * audit_core_dumps - record information about processes that end abnormally * @signr: signal value diff --git a/kernel/cgroup.c b/kernel/cgroup.c index 5cf366965d0..66a416b42c1 100644 --- a/kernel/cgroup.c +++ b/kernel/cgroup.c @@ -1460,9 +1460,9 @@ static int cgroup_get_rootdir(struct super_block *sb) return 0; } -static int cgroup_get_sb(struct file_system_type *fs_type, +static struct dentry *cgroup_mount(struct file_system_type *fs_type, int flags, const 
char *unused_dev_name, - void *data, struct vfsmount *mnt) + void *data) { struct cgroup_sb_opts opts; struct cgroupfs_root *root; @@ -1596,10 +1596,9 @@ static int cgroup_get_sb(struct file_system_type *fs_type, drop_parsed_module_refcounts(opts.subsys_bits); } - simple_set_mnt(mnt, sb); kfree(opts.release_agent); kfree(opts.name); - return 0; + return dget(sb->s_root); drop_new_super: deactivate_locked_super(sb); @@ -1608,7 +1607,7 @@ static int cgroup_get_sb(struct file_system_type *fs_type, out_err: kfree(opts.release_agent); kfree(opts.name); - return ret; + return ERR_PTR(ret); } static void cgroup_kill_sb(struct super_block *sb) { @@ -1658,7 +1657,7 @@ static void cgroup_kill_sb(struct super_block *sb) { static struct file_system_type cgroup_fs_type = { .name = "cgroup", - .get_sb = cgroup_get_sb, + .mount = cgroup_mount, .kill_sb = cgroup_kill_sb, }; diff --git a/kernel/cpuset.c b/kernel/cpuset.c index 51b143e2a07..4349935c2ad 100644 --- a/kernel/cpuset.c +++ b/kernel/cpuset.c @@ -231,18 +231,17 @@ static DEFINE_SPINLOCK(cpuset_buffer_lock); * users. If someone tries to mount the "cpuset" filesystem, we * silently switch it to mount "cgroup" instead */ -static int cpuset_get_sb(struct file_system_type *fs_type, - int flags, const char *unused_dev_name, - void *data, struct vfsmount *mnt) +static struct dentry *cpuset_mount(struct file_system_type *fs_type, + int flags, const char *unused_dev_name, void *data) { struct file_system_type *cgroup_fs = get_fs_type("cgroup"); - int ret = -ENODEV; + struct dentry *ret = ERR_PTR(-ENODEV); if (cgroup_fs) { char mountopts[] = "cpuset,noprefix," "release_agent=/sbin/cpuset_release_agent"; - ret = cgroup_fs->get_sb(cgroup_fs, flags, - unused_dev_name, mountopts, mnt); + ret = cgroup_fs->mount(cgroup_fs, flags, + unused_dev_name, mountopts); put_filesystem(cgroup_fs); } return ret; @@ -250,7 +249,7 @@ static int cpuset_get_sb(struct file_system_type *fs_type, static struct file_system_type cpuset_fs_type = { .name = "cpuset", - .get_sb = cpuset_get_sb, + .mount = cpuset_mount, }; /* diff --git a/kernel/debug/debug_core.c b/kernel/debug/debug_core.c index fec596da9bd..cefd4a11f6d 100644 --- a/kernel/debug/debug_core.c +++ b/kernel/debug/debug_core.c @@ -209,18 +209,6 @@ int __weak kgdb_skipexception(int exception, struct pt_regs *regs) return 0; } -/** - * kgdb_disable_hw_debug - Disable hardware debugging while we in kgdb. - * @regs: Current &struct pt_regs. - * - * This function will be called if the particular architecture must - * disable hardware debugging while it is processing gdb packets or - * handling exception. - */ -void __weak kgdb_disable_hw_debug(struct pt_regs *regs) -{ -} - /* * Some architectures need cache flushes when we set/clear a * breakpoint: @@ -484,7 +472,9 @@ static int kgdb_cpu_enter(struct kgdb_state *ks, struct pt_regs *regs, atomic_inc(&masters_in_kgdb); else atomic_inc(&slaves_in_kgdb); - kgdb_disable_hw_debug(ks->linux_regs); + + if (arch_kgdb_ops.disable_hw_break) + arch_kgdb_ops.disable_hw_break(regs); acquirelock: /* diff --git a/kernel/debug/kdb/kdb_main.c b/kernel/debug/kdb/kdb_main.c index d7bda21a106..37755d62192 100644 --- a/kernel/debug/kdb/kdb_main.c +++ b/kernel/debug/kdb/kdb_main.c @@ -1127,7 +1127,7 @@ static int kdb_local(kdb_reason_t reason, int error, struct pt_regs *regs, /* special case below */ } else { kdb_printf("\nEntering kdb (current=0x%p, pid %d) ", - kdb_current, kdb_current->pid); + kdb_current, kdb_current ? 
kdb_current->pid : 0); #if defined(CONFIG_SMP) kdb_printf("on processor %d ", raw_smp_processor_id()); #endif @@ -2603,20 +2603,17 @@ static int kdb_summary(int argc, const char **argv) */ static int kdb_per_cpu(int argc, const char **argv) { - char buf[256], fmtstr[64]; - kdb_symtab_t symtab; - cpumask_t suppress = CPU_MASK_NONE; - int cpu, diag; - unsigned long addr, val, bytesperword = 0, whichcpu = ~0UL; + char fmtstr[64]; + int cpu, diag, nextarg = 1; + unsigned long addr, symaddr, val, bytesperword = 0, whichcpu = ~0UL; if (argc < 1 || argc > 3) return KDB_ARGCOUNT; - snprintf(buf, sizeof(buf), "per_cpu__%s", argv[1]); - if (!kdbgetsymval(buf, &symtab)) { - kdb_printf("%s is not a per_cpu variable\n", argv[1]); - return KDB_BADADDR; - } + diag = kdbgetaddrarg(argc, argv, &nextarg, &symaddr, NULL, NULL); + if (diag) + return diag; + if (argc >= 2) { diag = kdbgetularg(argv[2], &bytesperword); if (diag) @@ -2649,46 +2646,25 @@ static int kdb_per_cpu(int argc, const char **argv) #define KDB_PCU(cpu) 0 #endif #endif - for_each_online_cpu(cpu) { + if (KDB_FLAG(CMD_INTERRUPT)) + return 0; + if (whichcpu != ~0UL && whichcpu != cpu) continue; - addr = symtab.sym_start + KDB_PCU(cpu); + addr = symaddr + KDB_PCU(cpu); diag = kdb_getword(&val, addr, bytesperword); if (diag) { kdb_printf("%5d " kdb_bfd_vma_fmt0 " - unable to " "read, diag=%d\n", cpu, addr, diag); continue; } -#ifdef CONFIG_SMP - if (!val) { - cpu_set(cpu, suppress); - continue; - } -#endif /* CONFIG_SMP */ kdb_printf("%5d ", cpu); kdb_md_line(fmtstr, addr, bytesperword == KDB_WORD_SIZE, 1, bytesperword, 1, 1, 0); } - if (cpus_weight(suppress) == 0) - return 0; - kdb_printf("Zero suppressed cpu(s):"); - for (cpu = first_cpu(suppress); cpu < num_possible_cpus(); - cpu = next_cpu(cpu, suppress)) { - kdb_printf(" %d", cpu); - if (cpu == num_possible_cpus() - 1 || - next_cpu(cpu, suppress) != cpu + 1) - continue; - while (cpu < num_possible_cpus() && - next_cpu(cpu, suppress) == cpu + 1) - ++cpu; - kdb_printf("-%d", cpu); - } - kdb_printf("\n"); - #undef KDB_PCU - return 0; } diff --git a/kernel/exit.c b/kernel/exit.c index b194febf579..21aa7b3001f 100644 --- a/kernel/exit.c +++ b/kernel/exit.c @@ -96,6 +96,14 @@ static void __exit_signal(struct task_struct *tsk) sig->tty = NULL; } else { /* + * This can only happen if the caller is de_thread(). + * FIXME: this is the temporary hack, we should teach + * posix-cpu-timers to handle this case correctly. 
+ */ + if (unlikely(has_group_leader_pid(tsk))) + posix_cpu_timers_exit_group(tsk); + + /* * If there is any task waiting for the group exit * then notify it: */ diff --git a/kernel/irq/manage.c b/kernel/irq/manage.c index 644e8d5fa36..5f92acc5f95 100644 --- a/kernel/irq/manage.c +++ b/kernel/irq/manage.c @@ -324,6 +324,10 @@ void enable_irq(unsigned int irq) if (!desc) return; + if (WARN(!desc->irq_data.chip || !desc->irq_data.chip->irq_enable, + KERN_ERR "enable_irq before setup/request_irq: irq %u\n", irq)) + return; + chip_bus_lock(desc); raw_spin_lock_irqsave(&desc->lock, flags); __enable_irq(desc, irq, false); diff --git a/kernel/latencytop.c b/kernel/latencytop.c index 877fb306d41..17110a4a4fc 100644 --- a/kernel/latencytop.c +++ b/kernel/latencytop.c @@ -194,14 +194,7 @@ __account_scheduler_latency(struct task_struct *tsk, int usecs, int inter) account_global_scheduler_latency(tsk, &lat); - /* - * short term hack; if we're > 32 we stop; future we recycle: - */ - tsk->latency_record_count++; - if (tsk->latency_record_count >= LT_SAVECOUNT) - goto out_unlock; - - for (i = 0; i < LT_SAVECOUNT; i++) { + for (i = 0; i < tsk->latency_record_count; i++) { struct latency_record *mylat; int same = 1; @@ -227,8 +220,14 @@ __account_scheduler_latency(struct task_struct *tsk, int usecs, int inter) } } + /* + * short term hack; if we're > 32 we stop; future we recycle: + */ + if (tsk->latency_record_count >= LT_SAVECOUNT) + goto out_unlock; + /* Allocated a new one: */ - i = tsk->latency_record_count; + i = tsk->latency_record_count++; memcpy(&tsk->latency_record[i], &lat, sizeof(struct latency_record)); out_unlock: diff --git a/kernel/module.c b/kernel/module.c index 437a74a7524..d190664f25f 100644 --- a/kernel/module.c +++ b/kernel/module.c @@ -2326,6 +2326,18 @@ static void find_module_sections(struct module *mod, struct load_info *info) kmemleak_scan_area(mod->trace_events, sizeof(*mod->trace_events) * mod->num_trace_events, GFP_KERNEL); #endif +#ifdef CONFIG_TRACING + mod->trace_bprintk_fmt_start = section_objs(info, "__trace_printk_fmt", + sizeof(*mod->trace_bprintk_fmt_start), + &mod->num_trace_bprintk_fmt); + /* + * This section contains pointers to allocated objects in the trace + * code and not scanning it leads to false positives. 
+ */ + kmemleak_scan_area(mod->trace_bprintk_fmt_start, + sizeof(*mod->trace_bprintk_fmt_start) * + mod->num_trace_bprintk_fmt, GFP_KERNEL); +#endif #ifdef CONFIG_FTRACE_MCOUNT_RECORD /* sechdrs[0].sh_size is always zero */ mod->ftrace_callsites = section_objs(info, "__mcount_loc", diff --git a/kernel/perf_event.c b/kernel/perf_event.c index 05b7d8c72c6..f818d9d2dc9 100644 --- a/kernel/perf_event.c +++ b/kernel/perf_event.c @@ -675,6 +675,8 @@ event_sched_in(struct perf_event *event, event->tstamp_running += ctx->time - event->tstamp_stopped; + event->shadow_ctx_time = ctx->time - ctx->timestamp; + if (!is_software_event(event)) cpuctx->active_oncpu++; ctx->nr_active++; @@ -3397,7 +3399,8 @@ static u32 perf_event_tid(struct perf_event *event, struct task_struct *p) } static void perf_output_read_one(struct perf_output_handle *handle, - struct perf_event *event) + struct perf_event *event, + u64 enabled, u64 running) { u64 read_format = event->attr.read_format; u64 values[4]; @@ -3405,11 +3408,11 @@ static void perf_output_read_one(struct perf_output_handle *handle, values[n++] = perf_event_count(event); if (read_format & PERF_FORMAT_TOTAL_TIME_ENABLED) { - values[n++] = event->total_time_enabled + + values[n++] = enabled + atomic64_read(&event->child_total_time_enabled); } if (read_format & PERF_FORMAT_TOTAL_TIME_RUNNING) { - values[n++] = event->total_time_running + + values[n++] = running + atomic64_read(&event->child_total_time_running); } if (read_format & PERF_FORMAT_ID) @@ -3422,7 +3425,8 @@ static void perf_output_read_one(struct perf_output_handle *handle, * XXX PERF_FORMAT_GROUP vs inherited events seems difficult. */ static void perf_output_read_group(struct perf_output_handle *handle, - struct perf_event *event) + struct perf_event *event, + u64 enabled, u64 running) { struct perf_event *leader = event->group_leader, *sub; u64 read_format = event->attr.read_format; @@ -3432,10 +3436,10 @@ static void perf_output_read_group(struct perf_output_handle *handle, values[n++] = 1 + leader->nr_siblings; if (read_format & PERF_FORMAT_TOTAL_TIME_ENABLED) - values[n++] = leader->total_time_enabled; + values[n++] = enabled; if (read_format & PERF_FORMAT_TOTAL_TIME_RUNNING) - values[n++] = leader->total_time_running; + values[n++] = running; if (leader != event) leader->pmu->read(leader); @@ -3460,13 +3464,35 @@ static void perf_output_read_group(struct perf_output_handle *handle, } } +#define PERF_FORMAT_TOTAL_TIMES (PERF_FORMAT_TOTAL_TIME_ENABLED|\ + PERF_FORMAT_TOTAL_TIME_RUNNING) + static void perf_output_read(struct perf_output_handle *handle, struct perf_event *event) { + u64 enabled = 0, running = 0, now, ctx_time; + u64 read_format = event->attr.read_format; + + /* + * compute total_time_enabled, total_time_running + * based on snapshot values taken when the event + * was last scheduled in. 
+ * + * we cannot simply called update_context_time() + * because of locking issue as we are called in + * NMI context + */ + if (read_format & PERF_FORMAT_TOTAL_TIMES) { + now = perf_clock(); + ctx_time = event->shadow_ctx_time + now; + enabled = ctx_time - event->tstamp_enabled; + running = ctx_time - event->tstamp_running; + } + if (event->attr.read_format & PERF_FORMAT_GROUP) - perf_output_read_group(handle, event); + perf_output_read_group(handle, event, enabled, running); else - perf_output_read_one(handle, event); + perf_output_read_one(handle, event, enabled, running); } void perf_output_sample(struct perf_output_handle *handle, diff --git a/kernel/printk.c b/kernel/printk.c index b2ebaee8c37..9a2264fc42c 100644 --- a/kernel/printk.c +++ b/kernel/printk.c @@ -261,6 +261,12 @@ static inline void boot_delay_msec(void) } #endif +#ifdef CONFIG_SECURITY_DMESG_RESTRICT +int dmesg_restrict = 1; +#else +int dmesg_restrict; +#endif + int do_syslog(int type, char __user *buf, int len, bool from_file) { unsigned i, j, limit, count; @@ -268,7 +274,20 @@ int do_syslog(int type, char __user *buf, int len, bool from_file) char c; int error = 0; - error = security_syslog(type, from_file); + /* + * If this is from /proc/kmsg we only do the capabilities checks + * at open time. + */ + if (type == SYSLOG_ACTION_OPEN || !from_file) { + if (dmesg_restrict && !capable(CAP_SYS_ADMIN)) + return -EPERM; + if ((type != SYSLOG_ACTION_READ_ALL && + type != SYSLOG_ACTION_SIZE_BUFFER) && + !capable(CAP_SYS_ADMIN)) + return -EPERM; + } + + error = security_syslog(type); if (error) return error; diff --git a/kernel/range.c b/kernel/range.c index 471b66acabb..37fa9b99ad5 100644 --- a/kernel/range.c +++ b/kernel/range.c @@ -119,7 +119,7 @@ static int cmp_range(const void *x1, const void *x2) int clean_sort_range(struct range *range, int az) { - int i, j, k = az - 1, nr_range = 0; + int i, j, k = az - 1, nr_range = az; for (i = 0; i < k; i++) { if (range[i].end) diff --git a/kernel/relay.c b/kernel/relay.c index c7cf397fb92..859ea5a9605 100644 --- a/kernel/relay.c +++ b/kernel/relay.c @@ -70,17 +70,10 @@ static const struct vm_operations_struct relay_file_mmap_ops = { */ static struct page **relay_alloc_page_array(unsigned int n_pages) { - struct page **array; - size_t pa_size = n_pages * sizeof(struct page *); - - if (pa_size > PAGE_SIZE) { - array = vmalloc(pa_size); - if (array) - memset(array, 0, pa_size); - } else { - array = kzalloc(pa_size, GFP_KERNEL); - } - return array; + const size_t pa_size = n_pages * sizeof(struct page *); + if (pa_size > PAGE_SIZE) + return vzalloc(pa_size); + return kzalloc(pa_size, GFP_KERNEL); } /* diff --git a/kernel/resource.c b/kernel/resource.c index 9c9841cb690..9fad33efd0d 100644 --- a/kernel/resource.c +++ b/kernel/resource.c @@ -40,6 +40,23 @@ EXPORT_SYMBOL(iomem_resource); static DEFINE_RWLOCK(resource_lock); +/* + * By default, we allocate free space bottom-up. The architecture can request + * top-down by clearing this flag. The user can override the architecture's + * choice with the "resource_alloc_from_bottom" kernel boot option, but that + * should only be a debugging tool. 
+ */ +int resource_alloc_from_bottom = 1; + +static __init int setup_alloc_from_bottom(char *s) +{ + printk(KERN_INFO + "resource: allocating from bottom-up; please report a bug\n"); + resource_alloc_from_bottom = 1; + return 0; +} +early_param("resource_alloc_from_bottom", setup_alloc_from_bottom); + static void *r_next(struct seq_file *m, void *v, loff_t *pos) { struct resource *p = v; @@ -357,8 +374,97 @@ int __weak page_is_ram(unsigned long pfn) return walk_system_ram_range(pfn, 1, NULL, __is_ram) == 1; } +static resource_size_t simple_align_resource(void *data, + const struct resource *avail, + resource_size_t size, + resource_size_t align) +{ + return avail->start; +} + +static void resource_clip(struct resource *res, resource_size_t min, + resource_size_t max) +{ + if (res->start < min) + res->start = min; + if (res->end > max) + res->end = max; +} + +static bool resource_contains(struct resource *res1, struct resource *res2) +{ + return res1->start <= res2->start && res1->end >= res2->end; +} + +/* + * Find the resource before "child" in the sibling list of "root" children. + */ +static struct resource *find_sibling_prev(struct resource *root, struct resource *child) +{ + struct resource *this; + + for (this = root->child; this; this = this->sibling) + if (this->sibling == child) + return this; + + return NULL; +} + /* * Find empty slot in the resource tree given range and alignment. + * This version allocates from the end of the root resource first. + */ +static int find_resource_from_top(struct resource *root, struct resource *new, + resource_size_t size, resource_size_t min, + resource_size_t max, resource_size_t align, + resource_size_t (*alignf)(void *, + const struct resource *, + resource_size_t, + resource_size_t), + void *alignf_data) +{ + struct resource *this; + struct resource tmp, avail, alloc; + + tmp.start = root->end; + tmp.end = root->end; + + this = find_sibling_prev(root, NULL); + for (;;) { + if (this) { + if (this->end < root->end) + tmp.start = this->end + 1; + } else + tmp.start = root->start; + + resource_clip(&tmp, min, max); + + /* Check for overflow after ALIGN() */ + avail = *new; + avail.start = ALIGN(tmp.start, align); + avail.end = tmp.end; + if (avail.start >= tmp.start) { + alloc.start = alignf(alignf_data, &avail, size, align); + alloc.end = alloc.start + size - 1; + if (resource_contains(&avail, &alloc)) { + new->start = alloc.start; + new->end = alloc.end; + return 0; + } + } + + if (!this || this->start == root->start) + break; + + tmp.end = this->start - 1; + this = find_sibling_prev(root, this); + } + return -EBUSY; +} + +/* + * Find empty slot in the resource tree given range and alignment. + * This version allocates from the beginning of the root resource first. */ static int find_resource(struct resource *root, struct resource *new, resource_size_t size, resource_size_t min, @@ -370,36 +476,43 @@ static int find_resource(struct resource *root, struct resource *new, void *alignf_data) { struct resource *this = root->child; - struct resource tmp = *new; + struct resource tmp = *new, avail, alloc; tmp.start = root->start; /* - * Skip past an allocated resource that starts at 0, since the assignment - * of this->start - 1 to tmp->end below would cause an underflow. + * Skip past an allocated resource that starts at 0, since the + * assignment of this->start - 1 to tmp->end below would cause an + * underflow. 
*/ if (this && this->start == 0) { tmp.start = this->end + 1; this = this->sibling; } - for(;;) { + for (;;) { if (this) tmp.end = this->start - 1; else tmp.end = root->end; - if (tmp.start < min) - tmp.start = min; - if (tmp.end > max) - tmp.end = max; - tmp.start = ALIGN(tmp.start, align); - if (alignf) - tmp.start = alignf(alignf_data, &tmp, size, align); - if (tmp.start < tmp.end && tmp.end - tmp.start >= size - 1) { - new->start = tmp.start; - new->end = tmp.start + size - 1; - return 0; + + resource_clip(&tmp, min, max); + + /* Check for overflow after ALIGN() */ + avail = *new; + avail.start = ALIGN(tmp.start, align); + avail.end = tmp.end; + if (avail.start >= tmp.start) { + alloc.start = alignf(alignf_data, &avail, size, align); + alloc.end = alloc.start + size - 1; + if (resource_contains(&avail, &alloc)) { + new->start = alloc.start; + new->end = alloc.end; + return 0; + } } + if (!this) break; + tmp.start = this->end + 1; this = this->sibling; } @@ -428,8 +541,14 @@ int allocate_resource(struct resource *root, struct resource *new, { int err; + if (!alignf) + alignf = simple_align_resource; + write_lock(&resource_lock); - err = find_resource(root, new, size, min, max, align, alignf, alignf_data); + if (resource_alloc_from_bottom) + err = find_resource(root, new, size, min, max, align, alignf, alignf_data); + else + err = find_resource_from_top(root, new, size, min, max, align, alignf, alignf_data); if (err >= 0 && __request_resource(root, new)) err = -EBUSY; write_unlock(&resource_lock); diff --git a/kernel/sched.c b/kernel/sched.c index d42992bccdf..aa14a56f9d0 100644 --- a/kernel/sched.c +++ b/kernel/sched.c @@ -8510,12 +8510,12 @@ void sched_move_task(struct task_struct *tsk) if (unlikely(running)) tsk->sched_class->put_prev_task(rq, tsk); - set_task_rq(tsk, task_cpu(tsk)); - #ifdef CONFIG_FAIR_GROUP_SCHED - if (tsk->sched_class->moved_group) - tsk->sched_class->moved_group(tsk, on_rq); + if (tsk->sched_class->task_move_group) + tsk->sched_class->task_move_group(tsk, on_rq); + else #endif + set_task_rq(tsk, task_cpu(tsk)); if (unlikely(running)) tsk->sched_class->set_curr_task(rq); diff --git a/kernel/sched_fair.c b/kernel/sched_fair.c index 933f3d1b62e..f4f6a8326dd 100644 --- a/kernel/sched_fair.c +++ b/kernel/sched_fair.c @@ -3869,13 +3869,26 @@ static void set_curr_task_fair(struct rq *rq) } #ifdef CONFIG_FAIR_GROUP_SCHED -static void moved_group_fair(struct task_struct *p, int on_rq) +static void task_move_group_fair(struct task_struct *p, int on_rq) { - struct cfs_rq *cfs_rq = task_cfs_rq(p); - - update_curr(cfs_rq); + /* + * If the task was not on the rq at the time of this cgroup movement + * it must have been asleep, sleeping tasks keep their ->vruntime + * absolute on their old rq until wakeup (needed for the fair sleeper + * bonus in place_entity()). + * + * If it was on the rq, we've just 'preempted' it, which does convert + * ->vruntime to a relative base. + * + * Make sure both cases convert their relative position when migrating + * to another cgroup's rq. This does somewhat interfere with the + * fair sleeper stuff for the first placement, but who cares. 
+ */ + if (!on_rq) + p->se.vruntime -= cfs_rq_of(&p->se)->min_vruntime; + set_task_rq(p, task_cpu(p)); if (!on_rq) - place_entity(cfs_rq, &p->se, 1); + p->se.vruntime += cfs_rq_of(&p->se)->min_vruntime; } #endif @@ -3927,7 +3940,7 @@ static const struct sched_class fair_sched_class = { .get_rr_interval = get_rr_interval_fair, #ifdef CONFIG_FAIR_GROUP_SCHED - .moved_group = moved_group_fair, + .task_move_group = task_move_group_fair, #endif }; diff --git a/kernel/sched_stats.h b/kernel/sched_stats.h index 25c2f962f6f..48ddf431db0 100644 --- a/kernel/sched_stats.h +++ b/kernel/sched_stats.h @@ -157,15 +157,7 @@ static inline void sched_info_reset_dequeued(struct task_struct *t) } /* - * Called when a process is dequeued from the active array and given - * the cpu. We should note that with the exception of interactive - * tasks, the expired queue will become the active queue after the active - * queue is empty, without explicitly dequeuing and requeuing tasks in the - * expired queue. (Interactive tasks may be requeued directly to the - * active queue, thus delaying tasks in the expired queue from running; - * see scheduler_tick()). - * - * Though we are interested in knowing how long it was from the *first* time a + * We are interested in knowing how long it was from the *first* time a * task was queued to the time that it finally hit a cpu, we call this routine * from dequeue_task() to account for possible rq->clock skew across cpus. The * delta taken on each cpu would annul the skew. @@ -203,16 +195,6 @@ static void sched_info_arrive(struct task_struct *t) } /* - * Called when a process is queued into either the active or expired - * array. The time is noted and later used to determine how long we - * had to wait for us to reach the cpu. Since the expired queue will - * become the active queue after active queue is empty, without dequeuing - * and requeuing any tasks, we are interested in queuing to either. It - * is unusual but not impossible for tasks to be dequeued and immediately - * requeued in the same or another array: this can happen in sched_yield(), - * set_user_nice(), and even load_balance() as it moves tasks from runqueue - * to runqueue. - * * This function is only called from enqueue_task(), but also only updates * the timestamp if it is already not set. It's assumed that * sched_info_dequeued() will clear that stamp when appropriate. 
diff --git a/kernel/sysctl.c b/kernel/sysctl.c index c33a1edb799..b65bf634035 100644 --- a/kernel/sysctl.c +++ b/kernel/sysctl.c @@ -704,6 +704,15 @@ static struct ctl_table kern_table[] = { }, #endif { + .procname = "dmesg_restrict", + .data = &dmesg_restrict, + .maxlen = sizeof(int), + .mode = 0644, + .proc_handler = proc_dointvec_minmax, + .extra1 = &zero, + .extra2 = &one, + }, + { .procname = "ngroups_max", .data = &ngroups_max, .maxlen = sizeof (int), diff --git a/kernel/trace/blktrace.c b/kernel/trace/blktrace.c index bc251ed6672..7b8ec028154 100644 --- a/kernel/trace/blktrace.c +++ b/kernel/trace/blktrace.c @@ -168,7 +168,6 @@ static int act_log_check(struct blk_trace *bt, u32 what, sector_t sector, static const u32 ddir_act[2] = { BLK_TC_ACT(BLK_TC_READ), BLK_TC_ACT(BLK_TC_WRITE) }; -#define BLK_TC_HARDBARRIER BLK_TC_BARRIER #define BLK_TC_RAHEAD BLK_TC_AHEAD /* The ilog2() calls fall out because they're constant */ @@ -196,7 +195,6 @@ static void __blk_add_trace(struct blk_trace *bt, sector_t sector, int bytes, return; what |= ddir_act[rw & WRITE]; - what |= MASK_TC_BIT(rw, HARDBARRIER); what |= MASK_TC_BIT(rw, SYNC); what |= MASK_TC_BIT(rw, RAHEAD); what |= MASK_TC_BIT(rw, META); @@ -1807,8 +1805,6 @@ void blk_fill_rwbs(char *rwbs, u32 rw, int bytes) if (rw & REQ_RAHEAD) rwbs[i++] = 'A'; - if (rw & REQ_HARDBARRIER) - rwbs[i++] = 'B'; if (rw & REQ_SYNC) rwbs[i++] = 'S'; if (rw & REQ_META) diff --git a/kernel/trace/trace.c b/kernel/trace/trace.c index 82d9b8106cd..ee6a7339cf0 100644 --- a/kernel/trace/trace.c +++ b/kernel/trace/trace.c @@ -1284,6 +1284,8 @@ void trace_dump_stack(void) __ftrace_trace_stack(global_trace.buffer, flags, 3, preempt_count()); } +static DEFINE_PER_CPU(int, user_stack_count); + void ftrace_trace_userstack(struct ring_buffer *buffer, unsigned long flags, int pc) { @@ -1302,6 +1304,18 @@ ftrace_trace_userstack(struct ring_buffer *buffer, unsigned long flags, int pc) if (unlikely(in_nmi())) return; + /* + * prevent recursion, since the user stack tracing may + * trigger other kernel events. + */ + preempt_disable(); + if (__this_cpu_read(user_stack_count)) + goto out; + + __this_cpu_inc(user_stack_count); + + + event = trace_buffer_lock_reserve(buffer, TRACE_USER_STACK, sizeof(*entry), flags, pc); if (!event) @@ -1319,6 +1333,11 @@ ftrace_trace_userstack(struct ring_buffer *buffer, unsigned long flags, int pc) save_stack_trace_user(&trace); if (!filter_check_discard(call, entry, buffer, event)) ring_buffer_unlock_commit(buffer, event); + + __this_cpu_dec(user_stack_count); + + out: + preempt_enable(); } #ifdef UNUSED diff --git a/kernel/watchdog.c b/kernel/watchdog.c index bafba687a6d..6e3c41a4024 100644 --- a/kernel/watchdog.c +++ b/kernel/watchdog.c @@ -43,7 +43,7 @@ static DEFINE_PER_CPU(unsigned long, hrtimer_interrupts_saved); static DEFINE_PER_CPU(struct perf_event *, watchdog_ev); #endif -static int __initdata no_watchdog; +static int no_watchdog; /* boot commands */ |
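
Illustrative note: the audit.c hunks above replace the tasklist_lock-based task lookup with an RCU-protected find_task_by_vpid() (taking a task reference where the subsequent call can sleep) and switch the siglock handling to lock_task_sighand(). Below is a minimal, hedged sketch of those two patterns with hypothetical helper names; it mirrors the AUDIT_TTY_GET and audit_prepare_user_tty changes but is not the actual audit code.

#include <linux/sched.h>
#include <linux/rcupdate.h>
#include <linux/errno.h>

/* Hypothetical helper: read a signal-struct field the way AUDIT_TTY_GET now does. */
static int example_get_task_flag(pid_t pid, int *value)
{
	struct task_struct *tsk;
	unsigned long flags;
	int err = 0;

	rcu_read_lock();
	tsk = find_task_by_vpid(pid);		/* only valid under rcu_read_lock() */
	if (tsk && lock_task_sighand(tsk, &flags)) {
		*value = tsk->signal->audit_tty != 0;
		unlock_task_sighand(tsk, &flags);
	} else {
		err = -ESRCH;
	}
	rcu_read_unlock();
	return err;
}

/* Hypothetical helper: keep the task alive across a call that may sleep. */
static int example_call_on_task(pid_t pid, int (*fn)(struct task_struct *))
{
	struct task_struct *tsk;
	int err;

	rcu_read_lock();
	tsk = find_task_by_vpid(pid);
	if (!tsk) {
		rcu_read_unlock();
		return -ESRCH;
	}
	get_task_struct(tsk);			/* reference outlives the RCU section */
	rcu_read_unlock();
	err = fn(tsk);				/* may sleep; tsk cannot be freed */
	put_task_struct(tsk);
	return err;
}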
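Illustrative note: cgroup.c and cpuset.c above are converted from the old ->get_sb()/vfsmount interface to the newer ->mount() hook, which returns the root dentry or an ERR_PTR. The following toy registration is only a sketch of the new-style API for a made-up "toyfs"; it uses the generic mount_nodev()/simple_fill_super() helpers rather than cgroup's hand-rolled superblock setup, and all toyfs_* names are invented.

#include <linux/fs.h>
#include <linux/module.h>

#define TOYFS_MAGIC 0x20101100	/* made-up magic number */

static int toyfs_fill_super(struct super_block *sb, void *data, int silent)
{
	static struct tree_descr empty[] = { {""} };

	/* simple_fill_super() builds a minimal root inode and dentry */
	return simple_fill_super(sb, TOYFS_MAGIC, empty);
}

/* New-style hook: return the root dentry (or ERR_PTR) instead of filling a vfsmount. */
static struct dentry *toyfs_mount(struct file_system_type *fs_type,
				  int flags, const char *dev_name, void *data)
{
	return mount_nodev(fs_type, flags, data, toyfs_fill_super);
}

static struct file_system_type toyfs_type = {
	.owner   = THIS_MODULE,
	.name    = "toyfs",
	.mount   = toyfs_mount,
	.kill_sb = kill_litter_super,
};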
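Illustrative note: perf_event.c above stores event->shadow_ctx_time = ctx->time - ctx->timestamp at sched-in so that perf_output_read(), running in NMI context, can reconstruct the context time as shadow_ctx_time + perf_clock() without taking locks. A tiny stand-alone demo of that unsigned arithmetic with toy numbers (not the kernel types or clocks):

#include <stdio.h>
#include <stdint.h>

int main(void)
{
	/* toy values, in ns */
	uint64_t ctx_time_at_sched_in = 400;	/* how long the ctx had been active */
	uint64_t clock_at_sched_in    = 10000;	/* clock reading at the same moment */
	uint64_t shadow = ctx_time_at_sched_in - clock_at_sched_in; /* wraps, on purpose */

	uint64_t now = 10750;			/* clock reading inside the "NMI" */
	uint64_t ctx_time = shadow + now;	/* == 400 + (10750 - 10000) == 1150 */

	uint64_t tstamp_enabled = 100;
	printf("ctx_time=%llu enabled=%llu\n",
	       (unsigned long long)ctx_time,
	       (unsigned long long)(ctx_time - tstamp_enabled));
	return 0;
}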
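Illustrative note: printk.c and sysctl.c above add the kernel.dmesg_restrict knob, gating syslog reads on CAP_SYS_ADMIN. The sketch below shows the same kind of sysctl-backed capability gate for a hypothetical example_restrict knob; registering the table (register_sysctl_paths() in this era) is omitted for brevity.

#include <linux/capability.h>
#include <linux/sysctl.h>
#include <linux/errno.h>

static int example_restrict;		/* 0 = open to all, 1 = CAP_SYS_ADMIN only */
static int zero, one = 1;

static int example_read_allowed(void)
{
	if (example_restrict && !capable(CAP_SYS_ADMIN))
		return -EPERM;
	return 0;
}

/* sysctl entry clamped to 0/1, in the style of kernel.dmesg_restrict */
static struct ctl_table example_table[] = {
	{
		.procname	= "example_restrict",
		.data		= &example_restrict,
		.maxlen		= sizeof(int),
		.mode		= 0644,
		.proc_handler	= proc_dointvec_minmax,
		.extra1		= &zero,
		.extra2		= &one,
	},
	{ }
};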
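Illustrative note: relay.c above replaces the vmalloc()+memset() pair with vzalloc() for page arrays larger than a page. A hedged sketch of the alloc/free pair for such a size-dependent, zeroed buffer (the free side mirrors what relay does with is_vmalloc_addr(); function names are invented):

#include <linux/slab.h>
#include <linux/vmalloc.h>
#include <linux/mm.h>

/* Zeroed page-pointer array: kzalloc() for small arrays, vzalloc() otherwise. */
static struct page **example_alloc_page_array(unsigned int n_pages)
{
	const size_t pa_size = n_pages * sizeof(struct page *);

	if (pa_size > PAGE_SIZE)
		return vzalloc(pa_size);
	return kzalloc(pa_size, GFP_KERNEL);
}

static void example_free_page_array(struct page **array)
{
	if (is_vmalloc_addr(array))
		vfree(array);
	else
		kfree(array);
}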
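Illustrative note: resource.c above introduces resource_clip() and resource_contains() helpers and a top-down find_resource_from_top() allocator. The helpers operate on inclusive [start, end] ranges, as struct resource does. Below is a small stand-alone demo of that range arithmetic, re-implemented on a toy struct rather than the kernel code itself:

#include <stdio.h>
#include <stdint.h>
#include <stdbool.h>

struct range { uint64_t start, end; };	/* end is inclusive, as in struct resource */

static void range_clip(struct range *r, uint64_t min, uint64_t max)
{
	if (r->start < min)
		r->start = min;
	if (r->end > max)
		r->end = max;
}

static bool range_contains(const struct range *r1, const struct range *r2)
{
	return r1->start <= r2->start && r1->end >= r2->end;
}

int main(void)
{
	struct range avail = { 0x1000, 0xffff };
	struct range alloc = { 0x2000, 0x2000 + 0x100 - 1 };	/* size 0x100 */

	range_clip(&avail, 0x1800, 0xefff);
	printf("avail after clip: [%#llx, %#llx]\n",
	       (unsigned long long)avail.start, (unsigned long long)avail.end);
	printf("alloc fits: %s\n", range_contains(&avail, &alloc) ? "yes" : "no");
	return 0;
}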
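Illustrative note: trace.c above guards the user-stack tracer against recursion with a per-CPU counter checked under preempt_disable(). A hedged sketch of that guard pattern around a hypothetical event-recording body (names invented):

#include <linux/percpu.h>
#include <linux/preempt.h>

static DEFINE_PER_CPU(int, example_in_handler);

static void example_record_event(void)
{
	/* keep preemption off so the per-CPU flag stays ours for the whole section */
	preempt_disable();
	if (__this_cpu_read(example_in_handler))
		goto out;
	__this_cpu_inc(example_in_handler);

	/* ... body that might recursively trigger example_record_event() ... */

	__this_cpu_dec(example_in_handler);
 out:
	preempt_enable();
}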