diff options
author | Ingo Molnar <mingo@elte.hu> | 2008-07-06 14:23:39 +0200 |
---|---|---|
committer | Ingo Molnar <mingo@elte.hu> | 2008-07-06 14:23:39 +0200 |
commit | 68083e05d72d94f347293d8cc0067050ba904bfa (patch) | |
tree | 842e71365bd90866be7add181661a4039d891564 /kernel | |
parent | 7baac8b91f9871ba8cb09af84de4ae1d86d07812 (diff) | |
parent | b7279469d66b55119784b8b9529c99c1955fe747 (diff) |
Merge commit 'v2.6.26-rc9' into cpus4096
Diffstat (limited to 'kernel')
-rw-r--r-- | kernel/audit.c | 6 | ||||
-rw-r--r-- | kernel/auditfilter.c | 3 | ||||
-rw-r--r-- | kernel/capability.c | 132 | ||||
-rw-r--r-- | kernel/cgroup.c | 2 | ||||
-rw-r--r-- | kernel/cpuset.c | 20 | ||||
-rw-r--r-- | kernel/exit.c | 7 | ||||
-rw-r--r-- | kernel/futex.c | 93 | ||||
-rw-r--r-- | kernel/hrtimer.c | 8 | ||||
-rw-r--r-- | kernel/kgdb.c | 19 | ||||
-rw-r--r-- | kernel/kprobes.c | 15 | ||||
-rw-r--r-- | kernel/module.c | 18 | ||||
-rw-r--r-- | kernel/rcuclassic.c | 16 | ||||
-rw-r--r-- | kernel/rcupreempt.c | 2 | ||||
-rw-r--r-- | kernel/relay.c | 2 | ||||
-rw-r--r-- | kernel/sched.c | 521 | ||||
-rw-r--r-- | kernel/sched_clock.c | 18 | ||||
-rw-r--r-- | kernel/sched_debug.c | 5 | ||||
-rw-r--r-- | kernel/sched_fair.c | 254 | ||||
-rw-r--r-- | kernel/sched_rt.c | 70 | ||||
-rw-r--r-- | kernel/sched_stats.h | 7 | ||||
-rw-r--r-- | kernel/signal.c | 51 | ||||
-rw-r--r-- | kernel/softlockup.c | 16 | ||||
-rw-r--r-- | kernel/stop_machine.c | 7 | ||||
-rw-r--r-- | kernel/sys.c | 6 | ||||
-rw-r--r-- | kernel/workqueue.c | 2 |
25 files changed, 559 insertions, 741 deletions
diff --git a/kernel/audit.c b/kernel/audit.c index e8692a5748c..e092f1c0ce3 100644 --- a/kernel/audit.c +++ b/kernel/audit.c @@ -738,7 +738,7 @@ static int audit_receive_msg(struct sk_buff *skb, struct nlmsghdr *nlh) if (!audit_enabled && msg_type != AUDIT_USER_AVC) return 0; - err = audit_filter_user(&NETLINK_CB(skb), msg_type); + err = audit_filter_user(&NETLINK_CB(skb)); if (err == 1) { err = 0; if (msg_type == AUDIT_USER_TTY) { @@ -779,7 +779,7 @@ static int audit_receive_msg(struct sk_buff *skb, struct nlmsghdr *nlh) } /* fallthrough */ case AUDIT_LIST: - err = audit_receive_filter(nlh->nlmsg_type, NETLINK_CB(skb).pid, + err = audit_receive_filter(msg_type, NETLINK_CB(skb).pid, uid, seq, data, nlmsg_len(nlh), loginuid, sessionid, sid); break; @@ -798,7 +798,7 @@ static int audit_receive_msg(struct sk_buff *skb, struct nlmsghdr *nlh) } /* fallthrough */ case AUDIT_LIST_RULES: - err = audit_receive_filter(nlh->nlmsg_type, NETLINK_CB(skb).pid, + err = audit_receive_filter(msg_type, NETLINK_CB(skb).pid, uid, seq, data, nlmsg_len(nlh), loginuid, sessionid, sid); break; diff --git a/kernel/auditfilter.c b/kernel/auditfilter.c index 0e0bd27e651..98c50cc671b 100644 --- a/kernel/auditfilter.c +++ b/kernel/auditfilter.c @@ -1544,6 +1544,7 @@ static void audit_log_rule_change(uid_t loginuid, u32 sessionid, u32 sid, * @data: payload data * @datasz: size of payload data * @loginuid: loginuid of sender + * @sessionid: sessionid for netlink audit message * @sid: SE Linux Security ID of sender */ int audit_receive_filter(int type, int pid, int uid, int seq, void *data, @@ -1720,7 +1721,7 @@ static int audit_filter_user_rules(struct netlink_skb_parms *cb, return 1; } -int audit_filter_user(struct netlink_skb_parms *cb, int type) +int audit_filter_user(struct netlink_skb_parms *cb) { enum audit_state state = AUDIT_DISABLED; struct audit_entry *e; diff --git a/kernel/capability.c b/kernel/capability.c index 39e8193b41e..901e0fdc3ff 100644 --- a/kernel/capability.c +++ b/kernel/capability.c @@ -53,11 +53,95 @@ static void warn_legacy_capability_use(void) } /* + * Version 2 capabilities worked fine, but the linux/capability.h file + * that accompanied their introduction encouraged their use without + * the necessary user-space source code changes. As such, we have + * created a version 3 with equivalent functionality to version 2, but + * with a header change to protect legacy source code from using + * version 2 when it wanted to use version 1. If your system has code + * that trips the following warning, it is using version 2 specific + * capabilities and may be doing so insecurely. + * + * The remedy is to either upgrade your version of libcap (to 2.10+, + * if the application is linked against it), or recompile your + * application with modern kernel headers and this warning will go + * away. + */ + +static void warn_deprecated_v2(void) +{ + static int warned; + + if (!warned) { + char name[sizeof(current->comm)]; + + printk(KERN_INFO "warning: `%s' uses deprecated v2" + " capabilities in a way that may be insecure.\n", + get_task_comm(name, current)); + warned = 1; + } +} + +/* + * Version check. Return the number of u32s in each capability flag + * array, or a negative value on error. + */ +static int cap_validate_magic(cap_user_header_t header, unsigned *tocopy) +{ + __u32 version; + + if (get_user(version, &header->version)) + return -EFAULT; + + switch (version) { + case _LINUX_CAPABILITY_VERSION_1: + warn_legacy_capability_use(); + *tocopy = _LINUX_CAPABILITY_U32S_1; + break; + case _LINUX_CAPABILITY_VERSION_2: + warn_deprecated_v2(); + /* + * fall through - v3 is otherwise equivalent to v2. + */ + case _LINUX_CAPABILITY_VERSION_3: + *tocopy = _LINUX_CAPABILITY_U32S_3; + break; + default: + if (put_user((u32)_KERNEL_CAPABILITY_VERSION, &header->version)) + return -EFAULT; + return -EINVAL; + } + + return 0; +} + +/* * For sys_getproccap() and sys_setproccap(), any of the three * capability set pointers may be NULL -- indicating that that set is * uninteresting and/or not to be changed. */ +/* + * Atomically modify the effective capabilities returning the original + * value. No permission check is performed here - it is assumed that the + * caller is permitted to set the desired effective capabilities. + */ +kernel_cap_t cap_set_effective(const kernel_cap_t pE_new) +{ + kernel_cap_t pE_old; + + spin_lock(&task_capability_lock); + + pE_old = current->cap_effective; + current->cap_effective = pE_new; + + spin_unlock(&task_capability_lock); + + return pE_old; +} + +EXPORT_SYMBOL(cap_set_effective); + /** * sys_capget - get the capabilities of a given process. * @header: pointer to struct that contains capability version and @@ -71,27 +155,13 @@ asmlinkage long sys_capget(cap_user_header_t header, cap_user_data_t dataptr) { int ret = 0; pid_t pid; - __u32 version; struct task_struct *target; unsigned tocopy; kernel_cap_t pE, pI, pP; - if (get_user(version, &header->version)) - return -EFAULT; - - switch (version) { - case _LINUX_CAPABILITY_VERSION_1: - warn_legacy_capability_use(); - tocopy = _LINUX_CAPABILITY_U32S_1; - break; - case _LINUX_CAPABILITY_VERSION_2: - tocopy = _LINUX_CAPABILITY_U32S_2; - break; - default: - if (put_user(_LINUX_CAPABILITY_VERSION, &header->version)) - return -EFAULT; - return -EINVAL; - } + ret = cap_validate_magic(header, &tocopy); + if (ret != 0) + return ret; if (get_user(pid, &header->pid)) return -EFAULT; @@ -118,7 +188,7 @@ out: spin_unlock(&task_capability_lock); if (!ret) { - struct __user_cap_data_struct kdata[_LINUX_CAPABILITY_U32S]; + struct __user_cap_data_struct kdata[_KERNEL_CAPABILITY_U32S]; unsigned i; for (i = 0; i < tocopy; i++) { @@ -128,7 +198,7 @@ out: } /* - * Note, in the case, tocopy < _LINUX_CAPABILITY_U32S, + * Note, in the case, tocopy < _KERNEL_CAPABILITY_U32S, * we silently drop the upper capabilities here. This * has the effect of making older libcap * implementations implicitly drop upper capability @@ -240,30 +310,16 @@ static inline int cap_set_all(kernel_cap_t *effective, */ asmlinkage long sys_capset(cap_user_header_t header, const cap_user_data_t data) { - struct __user_cap_data_struct kdata[_LINUX_CAPABILITY_U32S]; + struct __user_cap_data_struct kdata[_KERNEL_CAPABILITY_U32S]; unsigned i, tocopy; kernel_cap_t inheritable, permitted, effective; - __u32 version; struct task_struct *target; int ret; pid_t pid; - if (get_user(version, &header->version)) - return -EFAULT; - - switch (version) { - case _LINUX_CAPABILITY_VERSION_1: - warn_legacy_capability_use(); - tocopy = _LINUX_CAPABILITY_U32S_1; - break; - case _LINUX_CAPABILITY_VERSION_2: - tocopy = _LINUX_CAPABILITY_U32S_2; - break; - default: - if (put_user(_LINUX_CAPABILITY_VERSION, &header->version)) - return -EFAULT; - return -EINVAL; - } + ret = cap_validate_magic(header, &tocopy); + if (ret != 0) + return ret; if (get_user(pid, &header->pid)) return -EFAULT; @@ -281,7 +337,7 @@ asmlinkage long sys_capset(cap_user_header_t header, const cap_user_data_t data) permitted.cap[i] = kdata[i].permitted; inheritable.cap[i] = kdata[i].inheritable; } - while (i < _LINUX_CAPABILITY_U32S) { + while (i < _KERNEL_CAPABILITY_U32S) { effective.cap[i] = 0; permitted.cap[i] = 0; inheritable.cap[i] = 0; diff --git a/kernel/cgroup.c b/kernel/cgroup.c index fbc6fc8949b..15ac0e1e4f4 100644 --- a/kernel/cgroup.c +++ b/kernel/cgroup.c @@ -2903,7 +2903,7 @@ int cgroup_clone(struct task_struct *tsk, struct cgroup_subsys *subsys) cg = tsk->cgroups; parent = task_cgroup(tsk, subsys->subsys_id); - snprintf(nodename, MAX_CGROUP_TYPE_NAMELEN, "node_%d", tsk->pid); + snprintf(nodename, MAX_CGROUP_TYPE_NAMELEN, "%d", tsk->pid); /* Pin the hierarchy */ atomic_inc(&parent->root->sb->s_active); diff --git a/kernel/cpuset.c b/kernel/cpuset.c index 86ea9e34e32..9fceb97e989 100644 --- a/kernel/cpuset.c +++ b/kernel/cpuset.c @@ -797,8 +797,10 @@ static int update_cpumask(struct cpuset *cs, char *buf) retval = cpulist_parse(buf, trialcs.cpus_allowed); if (retval < 0) return retval; + + if (!cpus_subset(trialcs.cpus_allowed, cpu_online_map)) + return -EINVAL; } - cpus_and(trialcs.cpus_allowed, trialcs.cpus_allowed, cpu_online_map); retval = validate_change(cs, &trialcs); if (retval < 0) return retval; @@ -932,9 +934,11 @@ static int update_nodemask(struct cpuset *cs, char *buf) retval = nodelist_parse(buf, trialcs.mems_allowed); if (retval < 0) goto done; + + if (!nodes_subset(trialcs.mems_allowed, + node_states[N_HIGH_MEMORY])) + return -EINVAL; } - nodes_and(trialcs.mems_allowed, trialcs.mems_allowed, - node_states[N_HIGH_MEMORY]); oldmem = cs->mems_allowed; if (nodes_equal(oldmem, trialcs.mems_allowed)) { retval = 0; /* Too easy - nothing to do */ @@ -1033,8 +1037,8 @@ int current_cpuset_is_being_rebound(void) static int update_relax_domain_level(struct cpuset *cs, s64 val) { - if ((int)val < 0) - val = -1; + if (val < -1 || val >= SD_LV_MAX) + return -EINVAL; if (val != cs->relax_domain_level) { cs->relax_domain_level = val; @@ -1886,6 +1890,12 @@ static void common_cpu_mem_hotplug_unplug(void) top_cpuset.mems_allowed = node_states[N_HIGH_MEMORY]; scan_for_empty_cpusets(&top_cpuset); + /* + * Scheduler destroys domains on hotplug events. + * Rebuild them based on the current settings. + */ + rebuild_sched_domains(); + cgroup_unlock(); } diff --git a/kernel/exit.c b/kernel/exit.c index 1510f78a0ff..8f6185e69b6 100644 --- a/kernel/exit.c +++ b/kernel/exit.c @@ -126,6 +126,12 @@ static void __exit_signal(struct task_struct *tsk) __unhash_process(tsk); + /* + * Do this under ->siglock, we can race with another thread + * doing sigqueue_free() if we have SIGQUEUE_PREALLOC signals. + */ + flush_sigqueue(&tsk->pending); + tsk->signal = NULL; tsk->sighand = NULL; spin_unlock(&sighand->siglock); @@ -133,7 +139,6 @@ static void __exit_signal(struct task_struct *tsk) __cleanup_sighand(sighand); clear_tsk_thread_flag(tsk,TIF_SIGPENDING); - flush_sigqueue(&tsk->pending); if (sig) { flush_sigqueue(&sig->shared_pending); taskstats_tgid_free(sig); diff --git a/kernel/futex.c b/kernel/futex.c index 449def8074f..7d1136e97c1 100644 --- a/kernel/futex.c +++ b/kernel/futex.c @@ -1096,21 +1096,64 @@ static void unqueue_me_pi(struct futex_q *q) * private futexes. */ static int fixup_pi_state_owner(u32 __user *uaddr, struct futex_q *q, - struct task_struct *newowner) + struct task_struct *newowner, + struct rw_semaphore *fshared) { u32 newtid = task_pid_vnr(newowner) | FUTEX_WAITERS; struct futex_pi_state *pi_state = q->pi_state; + struct task_struct *oldowner = pi_state->owner; u32 uval, curval, newval; - int ret; + int ret, attempt = 0; /* Owner died? */ + if (!pi_state->owner) + newtid |= FUTEX_OWNER_DIED; + + /* + * We are here either because we stole the rtmutex from the + * pending owner or we are the pending owner which failed to + * get the rtmutex. We have to replace the pending owner TID + * in the user space variable. This must be atomic as we have + * to preserve the owner died bit here. + * + * Note: We write the user space value _before_ changing the + * pi_state because we can fault here. Imagine swapped out + * pages or a fork, which was running right before we acquired + * mmap_sem, that marked all the anonymous memory readonly for + * cow. + * + * Modifying pi_state _before_ the user space value would + * leave the pi_state in an inconsistent state when we fault + * here, because we need to drop the hash bucket lock to + * handle the fault. This might be observed in the PID check + * in lookup_pi_state. + */ +retry: + if (get_futex_value_locked(&uval, uaddr)) + goto handle_fault; + + while (1) { + newval = (uval & FUTEX_OWNER_DIED) | newtid; + + curval = cmpxchg_futex_value_locked(uaddr, uval, newval); + + if (curval == -EFAULT) + goto handle_fault; + if (curval == uval) + break; + uval = curval; + } + + /* + * We fixed up user space. Now we need to fix the pi_state + * itself. + */ if (pi_state->owner != NULL) { spin_lock_irq(&pi_state->owner->pi_lock); WARN_ON(list_empty(&pi_state->list)); list_del_init(&pi_state->list); spin_unlock_irq(&pi_state->owner->pi_lock); - } else - newtid |= FUTEX_OWNER_DIED; + } pi_state->owner = newowner; @@ -1118,26 +1161,35 @@ static int fixup_pi_state_owner(u32 __user *uaddr, struct futex_q *q, WARN_ON(!list_empty(&pi_state->list)); list_add(&pi_state->list, &newowner->pi_state_list); spin_unlock_irq(&newowner->pi_lock); + return 0; /* - * We own it, so we have to replace the pending owner - * TID. This must be atomic as we have preserve the - * owner died bit here. + * To handle the page fault we need to drop the hash bucket + * lock here. That gives the other task (either the pending + * owner itself or the task which stole the rtmutex) the + * chance to try the fixup of the pi_state. So once we are + * back from handling the fault we need to check the pi_state + * after reacquiring the hash bucket lock and before trying to + * do another fixup. When the fixup has been done already we + * simply return. */ - ret = get_futex_value_locked(&uval, uaddr); +handle_fault: + spin_unlock(q->lock_ptr); - while (!ret) { - newval = (uval & FUTEX_OWNER_DIED) | newtid; + ret = futex_handle_fault((unsigned long)uaddr, fshared, attempt++); - curval = cmpxchg_futex_value_locked(uaddr, uval, newval); + spin_lock(q->lock_ptr); - if (curval == -EFAULT) - ret = -EFAULT; - if (curval == uval) - break; - uval = curval; - } - return ret; + /* + * Check if someone else fixed it for us: + */ + if (pi_state->owner != oldowner) + return 0; + + if (ret) + return ret; + + goto retry; } /* @@ -1507,7 +1559,7 @@ static int futex_lock_pi(u32 __user *uaddr, struct rw_semaphore *fshared, * that case: */ if (q.pi_state->owner != curr) - ret = fixup_pi_state_owner(uaddr, &q, curr); + ret = fixup_pi_state_owner(uaddr, &q, curr, fshared); } else { /* * Catch the rare case, where the lock was released @@ -1539,7 +1591,8 @@ static int futex_lock_pi(u32 __user *uaddr, struct rw_semaphore *fshared, int res; owner = rt_mutex_owner(&q.pi_state->pi_mutex); - res = fixup_pi_state_owner(uaddr, &q, owner); + res = fixup_pi_state_owner(uaddr, &q, owner, + fshared); /* propagate -EFAULT, if the fixup failed */ if (res) diff --git a/kernel/hrtimer.c b/kernel/hrtimer.c index 421be5fe5cc..ab80515008f 100644 --- a/kernel/hrtimer.c +++ b/kernel/hrtimer.c @@ -1003,10 +1003,18 @@ hrtimer_start(struct hrtimer *timer, ktime_t tim, const enum hrtimer_mode mode) */ raise = timer->state == HRTIMER_STATE_PENDING; + /* + * We use preempt_disable to prevent this task from migrating after + * setting up the softirq and raising it. Otherwise, if me migrate + * we will raise the softirq on the wrong CPU. + */ + preempt_disable(); + unlock_hrtimer_base(timer, &flags); if (raise) hrtimer_raise_softirq(); + preempt_enable(); return ret; } diff --git a/kernel/kgdb.c b/kernel/kgdb.c index 14787de568b..3ec23c3ec97 100644 --- a/kernel/kgdb.c +++ b/kernel/kgdb.c @@ -52,6 +52,7 @@ #include <asm/byteorder.h> #include <asm/atomic.h> #include <asm/system.h> +#include <asm/unaligned.h> static int kgdb_break_asap; @@ -227,8 +228,6 @@ void __weak kgdb_disable_hw_debug(struct pt_regs *regs) * GDB remote protocol parser: */ -static const char hexchars[] = "0123456789abcdef"; - static int hex(char ch) { if ((ch >= 'a') && (ch <= 'f')) @@ -316,8 +315,8 @@ static void put_packet(char *buffer) } kgdb_io_ops->write_char('#'); - kgdb_io_ops->write_char(hexchars[checksum >> 4]); - kgdb_io_ops->write_char(hexchars[checksum & 0xf]); + kgdb_io_ops->write_char(hex_asc_hi(checksum)); + kgdb_io_ops->write_char(hex_asc_lo(checksum)); if (kgdb_io_ops->flush) kgdb_io_ops->flush(); @@ -478,8 +477,8 @@ static void error_packet(char *pkt, int error) { error = -error; pkt[0] = 'E'; - pkt[1] = hexchars[(error / 10)]; - pkt[2] = hexchars[(error % 10)]; + pkt[1] = hex_asc[(error / 10)]; + pkt[2] = hex_asc[(error % 10)]; pkt[3] = '\0'; } @@ -510,10 +509,7 @@ static void int_to_threadref(unsigned char *id, int value) scan = (unsigned char *)id; while (i--) *scan++ = 0; - *scan++ = (value >> 24) & 0xff; - *scan++ = (value >> 16) & 0xff; - *scan++ = (value >> 8) & 0xff; - *scan++ = (value & 0xff); + put_unaligned_be32(value, scan); } static struct task_struct *getthread(struct pt_regs *regs, int tid) @@ -1503,7 +1499,8 @@ int kgdb_nmicallback(int cpu, void *regs) return 1; } -void kgdb_console_write(struct console *co, const char *s, unsigned count) +static void kgdb_console_write(struct console *co, const char *s, + unsigned count) { unsigned long flags; diff --git a/kernel/kprobes.c b/kernel/kprobes.c index 1e0250cb948..d4998f81e22 100644 --- a/kernel/kprobes.c +++ b/kernel/kprobes.c @@ -699,8 +699,9 @@ static int __register_kprobes(struct kprobe **kps, int num, return -EINVAL; for (i = 0; i < num; i++) { ret = __register_kprobe(kps[i], called_from); - if (ret < 0 && i > 0) { - unregister_kprobes(kps, i); + if (ret < 0) { + if (i > 0) + unregister_kprobes(kps, i); break; } } @@ -776,8 +777,9 @@ static int __register_jprobes(struct jprobe **jps, int num, jp->kp.break_handler = longjmp_break_handler; ret = __register_kprobe(&jp->kp, called_from); } - if (ret < 0 && i > 0) { - unregister_jprobes(jps, i); + if (ret < 0) { + if (i > 0) + unregister_jprobes(jps, i); break; } } @@ -920,8 +922,9 @@ static int __register_kretprobes(struct kretprobe **rps, int num, return -EINVAL; for (i = 0; i < num; i++) { ret = __register_kretprobe(rps[i], called_from); - if (ret < 0 && i > 0) { - unregister_kretprobes(rps, i); + if (ret < 0) { + if (i > 0) + unregister_kretprobes(rps, i); break; } } diff --git a/kernel/module.c b/kernel/module.c index f5e9491ef7a..5f80478b746 100644 --- a/kernel/module.c +++ b/kernel/module.c @@ -1337,7 +1337,19 @@ out_unreg: kobject_put(&mod->mkobj.kobj); return err; } -#endif + +static void mod_sysfs_fini(struct module *mod) +{ + kobject_put(&mod->mkobj.kobj); +} + +#else /* CONFIG_SYSFS */ + +static void mod_sysfs_fini(struct module *mod) +{ +} + +#endif /* CONFIG_SYSFS */ static void mod_kobject_remove(struct module *mod) { @@ -1345,7 +1357,7 @@ static void mod_kobject_remove(struct module *mod) module_param_sysfs_remove(mod); kobject_put(mod->mkobj.drivers_dir); kobject_put(mod->holders_dir); - kobject_put(&mod->mkobj.kobj); + mod_sysfs_fini(mod); } /* @@ -1780,7 +1792,7 @@ static struct module *load_module(void __user *umod, /* Sanity checks against insmoding binaries or wrong arch, weird elf version */ - if (memcmp(hdr->e_ident, ELFMAG, 4) != 0 + if (memcmp(hdr->e_ident, ELFMAG, SELFMAG) != 0 || hdr->e_type != ET_REL || !elf_check_arch(hdr) || hdr->e_shentsize != sizeof(*sechdrs)) { diff --git a/kernel/rcuclassic.c b/kernel/rcuclassic.c index 251358de70b..adde10388d0 100644 --- a/kernel/rcuclassic.c +++ b/kernel/rcuclassic.c @@ -89,8 +89,22 @@ static void force_quiescent_state(struct rcu_data *rdp, /* * Don't send IPI to itself. With irqs disabled, * rdp->cpu is the current cpu. + * + * cpu_online_map is updated by the _cpu_down() + * using stop_machine_run(). Since we're in irqs disabled + * section, stop_machine_run() is not exectuting, hence + * the cpu_online_map is stable. + * + * However, a cpu might have been offlined _just_ before + * we disabled irqs while entering here. + * And rcu subsystem might not yet have handled the CPU_DEAD + * notification, leading to the offlined cpu's bit + * being set in the rcp->cpumask. + * + * Hence cpumask = (rcp->cpumask & cpu_online_map) to prevent + * sending smp_reschedule() to an offlined CPU. */ - cpumask = rcp->cpumask; + cpus_and(cpumask, rcp->cpumask, cpu_online_map); cpu_clear(rdp->cpu, cpumask); for_each_cpu_mask_nr(cpu, cpumask) smp_send_reschedule(cpu); diff --git a/kernel/rcupreempt.c b/kernel/rcupreempt.c index 18af270125c..5cbd69edf5d 100644 --- a/kernel/rcupreempt.c +++ b/kernel/rcupreempt.c @@ -217,8 +217,6 @@ long rcu_batches_completed(void) } EXPORT_SYMBOL_GPL(rcu_batches_completed); -EXPORT_SYMBOL_GPL(rcu_batches_completed_bh); - void __rcu_read_lock(void) { int idx; diff --git a/kernel/relay.c b/kernel/relay.c index bc24dcdc570..7de644cdec4 100644 --- a/kernel/relay.c +++ b/kernel/relay.c @@ -1191,7 +1191,7 @@ static ssize_t relay_file_splice_read(struct file *in, ret = 0; spliced = 0; - while (len) { + while (len && !spliced) { ret = subbuf_splice_actor(in, ppos, pipe, len, flags, &nonpad_ret); if (ret < 0) break; diff --git a/kernel/sched.c b/kernel/sched.c index 814d6e17f1e..e6795e39c8a 100644 --- a/kernel/sched.c +++ b/kernel/sched.c @@ -136,7 +136,7 @@ static inline void sg_inc_cpu_power(struct sched_group *sg, u32 val) static inline int rt_policy(int policy) { - if (unlikely(policy == SCHED_FIFO) || unlikely(policy == SCHED_RR)) + if (unlikely(policy == SCHED_FIFO || policy == SCHED_RR)) return 1; return 0; } @@ -312,12 +312,15 @@ static DEFINE_SPINLOCK(task_group_lock); #endif /* - * A weight of 0, 1 or ULONG_MAX can cause arithmetics problems. + * A weight of 0 or 1 can cause arithmetics problems. + * A weight of a cfs_rq is the sum of weights of which entities + * are queued on this cfs_rq, so a weight of a entity should not be + * too large, so as the shares value of a task group. * (The default weight is 1024 - so there's no practical * limitation from this.) */ #define MIN_SHARES 2 -#define MAX_SHARES (ULONG_MAX - 1) +#define MAX_SHARES (1UL << 18) static int init_task_group_load = INIT_TASK_GROUP_LOAD; #endif @@ -398,43 +401,6 @@ struct cfs_rq { */ struct list_head leaf_cfs_rq_list; struct task_group *tg; /* group that "owns" this runqueue */ - -#ifdef CONFIG_SMP - unsigned long task_weight; - unsigned long shares; - /* - * We need space to build a sched_domain wide view of the full task - * group tree, in order to avoid depending on dynamic memory allocation - * during the load balancing we place this in the per cpu task group - * hierarchy. This limits the load balancing to one instance per cpu, - * but more should not be needed anyway. - */ - struct aggregate_struct { - /* - * load = weight(cpus) * f(tg) - * - * Where f(tg) is the recursive weight fraction assigned to - * this group. - */ - unsigned long load; - - /* - * part of the group weight distributed to this span. - */ - unsigned long shares; - - /* - * The sum of all runqueue weights within this span. - */ - unsigned long rq_weight; - - /* - * Weight contributed by tasks; this is the part we can - * influence by moving tasks around. - */ - unsigned long task_weight; - } aggregate; -#endif #endif }; @@ -1161,6 +1127,7 @@ static enum hrtimer_restart hrtick(struct hrtimer *timer) return HRTIMER_NORESTART; } +#ifdef CONFIG_SMP static void hotplug_hrtick_disable(int cpu) { struct rq *rq = cpu_rq(cpu); @@ -1216,6 +1183,7 @@ static void init_hrtick(void) { hotcpu_notifier(hotplug_hrtick, 0); } +#endif /* CONFIG_SMP */ static void init_rq_hrtick(struct rq *rq) { @@ -1368,17 +1336,19 @@ static void __resched_task(struct task_struct *p, int tif_bit) */ #define SRR(x, y) (((x) + (1UL << ((y) - 1))) >> (y)) -/* - * delta *= weight / lw - */ static unsigned long calc_delta_mine(unsigned long delta_exec, unsigned long weight, struct load_weight *lw) { u64 tmp; - if (!lw->inv_weight) - lw->inv_weight = 1 + (WMULT_CONST-lw->weight/2)/(lw->weight+1); + if (!lw->inv_weight) { + if (BITS_PER_LONG > 32 && unlikely(lw->weight >= WMULT_CONST)) + lw->inv_weight = 1; + else + lw->inv_weight = 1 + (WMULT_CONST-lw->weight/2) + / (lw->weight+1); + } tmp = (u64)delta_exec * weight; /* @@ -1393,6 +1363,12 @@ calc_delta_mine(unsigned long delta_exec, unsigned long weight, return (unsigned long)min(tmp, (u64)(unsigned long)LONG_MAX); } +static inline unsigned long +calc_delta_fair(unsigned long delta_exec, struct load_weight *lw) +{ + return calc_delta_mine(delta_exec, NICE_0_LOAD, lw); +} + static inline void update_load_add(struct load_weight *lw, unsigned long inc) { lw->weight += inc; @@ -1505,326 +1481,6 @@ static unsigned long source_load(int cpu, int type); static unsigned long target_load(int cpu, int type); static unsigned long cpu_avg_load_per_task(int cpu); static int task_hot(struct task_struct *p, u64 now, struct sched_domain *sd); - -#ifdef CONFIG_FAIR_GROUP_SCHED - -/* - * Group load balancing. - * - * We calculate a few balance domain wide aggregate numbers; load and weight. - * Given the pictures below, and assuming each item has equal weight: - * - * root 1 - thread - * / | \ A - group - * A 1 B - * /|\ / \ - * C 2 D 3 4 - * | | - * 5 6 - * - * load: - * A and B get 1/3-rd of the total load. C and D get 1/3-rd of A's 1/3-rd, - * which equals 1/9-th of the total load. - * - * shares: - * The weight of this group on the selected cpus. - * - * rq_weight: - * Direct sum of all the cpu's their rq weight, e.g. A would get 3 while - * B would get 2. - * - * task_weight: - * Part of the rq_weight contributed by tasks; all groups except B would - * get 1, B gets 2. - */ - -static inline struct aggregate_struct * -aggregate(struct task_group *tg, struct sched_domain *sd) -{ - return &tg->cfs_rq[sd->first_cpu]->aggregate; -} - -typedef void (*aggregate_func)(struct task_group *, struct sched_domain *); - -/* - * Iterate the full tree, calling @down when first entering a node and @up when - * leaving it for the final time. - */ -static -void aggregate_walk_tree(aggregate_func down, aggregate_func up, - struct sched_domain *sd) -{ - struct task_group *parent, *child; - - rcu_read_lock(); - parent = &root_task_group; -down: - (*down)(parent, sd); - list_for_each_entry_rcu(child, &parent->children, siblings) { - parent = child; - goto down; - -up: - continue; |