diff options
Diffstat (limited to 'kernel')
61 files changed, 3365 insertions, 2576 deletions
diff --git a/kernel/async.c b/kernel/async.c index bd0c168a3bb..9d311838485 100644 --- a/kernel/async.c +++ b/kernel/async.c @@ -62,8 +62,10 @@ static async_cookie_t next_cookie = 1; #define MAX_WORK 32768 static LIST_HEAD(async_pending); -static LIST_HEAD(async_running); +static ASYNC_DOMAIN(async_running); +static LIST_HEAD(async_domains); static DEFINE_SPINLOCK(async_lock); +static DEFINE_MUTEX(async_register_mutex); struct async_entry { struct list_head list; @@ -71,7 +73,7 @@ struct async_entry { async_cookie_t cookie; async_func_ptr *func; void *data; - struct list_head *running; + struct async_domain *running; }; static DECLARE_WAIT_QUEUE_HEAD(async_done); @@ -82,13 +84,12 @@ static atomic_t entry_count; /* * MUST be called with the lock held! */ -static async_cookie_t __lowest_in_progress(struct list_head *running) +static async_cookie_t __lowest_in_progress(struct async_domain *running) { struct async_entry *entry; - if (!list_empty(running)) { - entry = list_first_entry(running, - struct async_entry, list); + if (!list_empty(&running->domain)) { + entry = list_first_entry(&running->domain, typeof(*entry), list); return entry->cookie; } @@ -99,7 +100,7 @@ static async_cookie_t __lowest_in_progress(struct list_head *running) return next_cookie; /* "infinity" value */ } -static async_cookie_t lowest_in_progress(struct list_head *running) +static async_cookie_t lowest_in_progress(struct async_domain *running) { unsigned long flags; async_cookie_t ret; @@ -119,10 +120,11 @@ static void async_run_entry_fn(struct work_struct *work) container_of(work, struct async_entry, work); unsigned long flags; ktime_t uninitialized_var(calltime), delta, rettime; + struct async_domain *running = entry->running; /* 1) move self to the running queue */ spin_lock_irqsave(&async_lock, flags); - list_move_tail(&entry->list, entry->running); + list_move_tail(&entry->list, &running->domain); spin_unlock_irqrestore(&async_lock, flags); /* 2) run (and print duration) */ @@ -145,6 +147,8 @@ static void async_run_entry_fn(struct work_struct *work) /* 3) remove self from the running queue */ spin_lock_irqsave(&async_lock, flags); list_del(&entry->list); + if (running->registered && --running->count == 0) + list_del_init(&running->node); /* 4) free the entry */ kfree(entry); @@ -156,7 +160,7 @@ static void async_run_entry_fn(struct work_struct *work) wake_up(&async_done); } -static async_cookie_t __async_schedule(async_func_ptr *ptr, void *data, struct list_head *running) +static async_cookie_t __async_schedule(async_func_ptr *ptr, void *data, struct async_domain *running) { struct async_entry *entry; unsigned long flags; @@ -187,6 +191,8 @@ static async_cookie_t __async_schedule(async_func_ptr *ptr, void *data, struct l spin_lock_irqsave(&async_lock, flags); newcookie = entry->cookie = next_cookie++; list_add_tail(&entry->list, &async_pending); + if (running->registered && running->count++ == 0) + list_add_tail(&running->node, &async_domains); atomic_inc(&entry_count); spin_unlock_irqrestore(&async_lock, flags); @@ -223,7 +229,7 @@ EXPORT_SYMBOL_GPL(async_schedule); * Note: This function may be called from atomic or non-atomic contexts. */ async_cookie_t async_schedule_domain(async_func_ptr *ptr, void *data, - struct list_head *running) + struct async_domain *running) { return __async_schedule(ptr, data, running); } @@ -236,22 +242,52 @@ EXPORT_SYMBOL_GPL(async_schedule_domain); */ void async_synchronize_full(void) { + mutex_lock(&async_register_mutex); do { - async_synchronize_cookie(next_cookie); - } while (!list_empty(&async_running) || !list_empty(&async_pending)); + struct async_domain *domain = NULL; + + spin_lock_irq(&async_lock); + if (!list_empty(&async_domains)) + domain = list_first_entry(&async_domains, typeof(*domain), node); + spin_unlock_irq(&async_lock); + + async_synchronize_cookie_domain(next_cookie, domain); + } while (!list_empty(&async_domains)); + mutex_unlock(&async_register_mutex); } EXPORT_SYMBOL_GPL(async_synchronize_full); /** + * async_unregister_domain - ensure no more anonymous waiters on this domain + * @domain: idle domain to flush out of any async_synchronize_full instances + * + * async_synchronize_{cookie|full}_domain() are not flushed since callers + * of these routines should know the lifetime of @domain + * + * Prefer ASYNC_DOMAIN_EXCLUSIVE() declarations over flushing + */ +void async_unregister_domain(struct async_domain *domain) +{ + mutex_lock(&async_register_mutex); + spin_lock_irq(&async_lock); + WARN_ON(!domain->registered || !list_empty(&domain->node) || + !list_empty(&domain->domain)); + domain->registered = 0; + spin_unlock_irq(&async_lock); + mutex_unlock(&async_register_mutex); +} +EXPORT_SYMBOL_GPL(async_unregister_domain); + +/** * async_synchronize_full_domain - synchronize all asynchronous function within a certain domain - * @list: running list to synchronize on + * @domain: running list to synchronize on * * This function waits until all asynchronous function calls for the - * synchronization domain specified by the running list @list have been done. + * synchronization domain specified by the running list @domain have been done. */ -void async_synchronize_full_domain(struct list_head *list) +void async_synchronize_full_domain(struct async_domain *domain) { - async_synchronize_cookie_domain(next_cookie, list); + async_synchronize_cookie_domain(next_cookie, domain); } EXPORT_SYMBOL_GPL(async_synchronize_full_domain); @@ -261,14 +297,16 @@ EXPORT_SYMBOL_GPL(async_synchronize_full_domain); * @running: running list to synchronize on * * This function waits until all asynchronous function calls for the - * synchronization domain specified by the running list @list submitted + * synchronization domain specified by running list @running submitted * prior to @cookie have been done. */ -void async_synchronize_cookie_domain(async_cookie_t cookie, - struct list_head *running) +void async_synchronize_cookie_domain(async_cookie_t cookie, struct async_domain *running) { ktime_t uninitialized_var(starttime), delta, endtime; + if (!running) + return; + if (initcall_debug && system_state == SYSTEM_BOOTING) { printk(KERN_DEBUG "async_waiting @ %i\n", task_pid_nr(current)); starttime = ktime_get(); diff --git a/kernel/audit.c b/kernel/audit.c index fda8bd9e1d3..ea3b7b6191c 100644 --- a/kernel/audit.c +++ b/kernel/audit.c @@ -384,7 +384,7 @@ static void audit_hold_skb(struct sk_buff *skb) static void audit_printk_skb(struct sk_buff *skb) { struct nlmsghdr *nlh = nlmsg_hdr(skb); - char *data = NLMSG_DATA(nlh); + char *data = nlmsg_data(nlh); if (nlh->nlmsg_type != AUDIT_EOE) { if (printk_ratelimit()) @@ -516,14 +516,15 @@ struct sk_buff *audit_make_reply(int pid, int seq, int type, int done, if (!skb) return NULL; - nlh = NLMSG_NEW(skb, pid, seq, t, size, flags); - data = NLMSG_DATA(nlh); + nlh = nlmsg_put(skb, pid, seq, t, size, flags); + if (!nlh) + goto out_kfree_skb; + data = nlmsg_data(nlh); memcpy(data, payload, size); return skb; -nlmsg_failure: /* Used by NLMSG_NEW */ - if (skb) - kfree_skb(skb); +out_kfree_skb: + kfree_skb(skb); return NULL; } @@ -680,7 +681,7 @@ static int audit_receive_msg(struct sk_buff *skb, struct nlmsghdr *nlh) sessionid = audit_get_sessionid(current); security_task_getsecid(current, &sid); seq = nlh->nlmsg_seq; - data = NLMSG_DATA(nlh); + data = nlmsg_data(nlh); switch (msg_type) { case AUDIT_GET: @@ -961,14 +962,17 @@ static void audit_receive(struct sk_buff *skb) static int __init audit_init(void) { int i; + struct netlink_kernel_cfg cfg = { + .input = audit_receive, + }; if (audit_initialized == AUDIT_DISABLED) return 0; printk(KERN_INFO "audit: initializing netlink socket (%s)\n", audit_default ? "enabled" : "disabled"); - audit_sock = netlink_kernel_create(&init_net, NETLINK_AUDIT, 0, - audit_receive, NULL, THIS_MODULE); + audit_sock = netlink_kernel_create(&init_net, NETLINK_AUDIT, + THIS_MODULE, &cfg); if (!audit_sock) audit_panic("cannot initialize netlink socket"); else @@ -1060,13 +1064,15 @@ static struct audit_buffer * audit_buffer_alloc(struct audit_context *ctx, ab->skb = nlmsg_new(AUDIT_BUFSIZ, gfp_mask); if (!ab->skb) - goto nlmsg_failure; + goto err; - nlh = NLMSG_NEW(ab->skb, 0, 0, type, 0, 0); + nlh = nlmsg_put(ab->skb, 0, 0, type, 0, 0); + if (!nlh) + goto out_kfree_skb; return ab; -nlmsg_failure: /* Used by NLMSG_NEW */ +out_kfree_skb: kfree_skb(ab->skb); ab->skb = NULL; err: diff --git a/kernel/cgroup.c b/kernel/cgroup.c index af2b5641fc8..79818507e44 100644 --- a/kernel/cgroup.c +++ b/kernel/cgroup.c @@ -954,7 +954,7 @@ static int cgroup_rm_file(struct cgroup *cgrp, const struct cftype *cft) dget(d); d_delete(d); - simple_unlink(d->d_inode, d); + simple_unlink(cgrp->dentry->d_inode, d); list_del_init(&cfe->node); dput(d); @@ -1068,28 +1068,24 @@ static int rebind_subsystems(struct cgroupfs_root *root, BUG_ON(cgrp->subsys[i]); BUG_ON(!dummytop->subsys[i]); BUG_ON(dummytop->subsys[i]->cgroup != dummytop); - mutex_lock(&ss->hierarchy_mutex); cgrp->subsys[i] = dummytop->subsys[i]; cgrp->subsys[i]->cgroup = cgrp; list_move(&ss->sibling, &root->subsys_list); ss->root = root; if (ss->bind) ss->bind(cgrp); - mutex_unlock(&ss->hierarchy_mutex); /* refcount was already taken, and we're keeping it */ } else if (bit & removed_bits) { /* We're removing this subsystem */ BUG_ON(ss == NULL); BUG_ON(cgrp->subsys[i] != dummytop->subsys[i]); BUG_ON(cgrp->subsys[i]->cgroup != cgrp); - mutex_lock(&ss->hierarchy_mutex); if (ss->bind) ss->bind(dummytop); dummytop->subsys[i]->cgroup = dummytop; cgrp->subsys[i] = NULL; subsys[i]->root = &rootnode; list_move(&ss->sibling, &rootnode.subsys_list); - mutex_unlock(&ss->hierarchy_mutex); /* subsystem is now free - drop reference on module */ module_put(ss->module); } else if (bit & final_bits) { @@ -3915,37 +3911,6 @@ static void init_cgroup_css(struct cgroup_subsys_state *css, set_bit(CSS_CLEAR_CSS_REFS, &css->flags); } -static void cgroup_lock_hierarchy(struct cgroupfs_root *root) -{ - /* We need to take each hierarchy_mutex in a consistent order */ - int i; - - /* - * No worry about a race with rebind_subsystems that might mess up the - * locking order, since both parties are under cgroup_mutex. - */ - for (i = 0; i < CGROUP_SUBSYS_COUNT; i++) { - struct cgroup_subsys *ss = subsys[i]; - if (ss == NULL) - continue; - if (ss->root == root) - mutex_lock(&ss->hierarchy_mutex); - } -} - -static void cgroup_unlock_hierarchy(struct cgroupfs_root *root) -{ - int i; - - for (i = 0; i < CGROUP_SUBSYS_COUNT; i++) { - struct cgroup_subsys *ss = subsys[i]; - if (ss == NULL) - continue; - if (ss->root == root) - mutex_unlock(&ss->hierarchy_mutex); - } -} - /* * cgroup_create - create a cgroup * @parent: cgroup that will be parent of the new cgroup @@ -4006,9 +3971,7 @@ static long cgroup_create(struct cgroup *parent, struct dentry *dentry, ss->post_clone(cgrp); } - cgroup_lock_hierarchy(root); list_add(&cgrp->sibling, &cgrp->parent->children); - cgroup_unlock_hierarchy(root); root->number_of_cgroups++; err = cgroup_create_dir(cgrp, dentry, mode); @@ -4035,9 +3998,7 @@ static long cgroup_create(struct cgroup *parent, struct dentry *dentry, err_remove: - cgroup_lock_hierarchy(root); list_del(&cgrp->sibling); - cgroup_unlock_hierarchy(root); root->number_of_cgroups--; err_destroy: @@ -4245,10 +4206,8 @@ again: list_del_init(&cgrp->release_list); raw_spin_unlock(&release_list_lock); - cgroup_lock_hierarchy(cgrp->root); /* delete this cgroup from parent->children */ list_del_init(&cgrp->sibling); - cgroup_unlock_hierarchy(cgrp->root); list_del_init(&cgrp->allcg_node); @@ -4322,8 +4281,6 @@ static void __init cgroup_init_subsys(struct cgroup_subsys *ss) * need to invoke fork callbacks here. */ BUG_ON(!list_empty(&init_task.tasks)); - mutex_init(&ss->hierarchy_mutex); - lockdep_set_class(&ss->hierarchy_mutex, &ss->subsys_key); ss->active = 1; /* this function shouldn't be used with modular subsystems, since they @@ -4450,8 +4407,6 @@ int __init_or_module cgroup_load_subsys(struct cgroup_subsys *ss) } write_unlock(&css_set_lock); - mutex_init(&ss->hierarchy_mutex); - lockdep_set_class(&ss->hierarchy_mutex, &ss->subsys_key); ss->active = 1; /* success! */ diff --git a/kernel/cpu.c b/kernel/cpu.c index a4eb5227a19..14d32588ccc 100644 --- a/kernel/cpu.c +++ b/kernel/cpu.c @@ -416,7 +416,7 @@ int __cpuinit cpu_up(unsigned int cpu) if (pgdat->node_zonelists->_zonerefs->zone == NULL) { mutex_lock(&zonelists_mutex); - build_all_zonelists(NULL); + build_all_zonelists(NULL, NULL); mutex_unlock(&zonelists_mutex); } #endif diff --git a/kernel/cpuset.c b/kernel/cpuset.c index 8c8bd652dd1..f33c7153b6d 100644 --- a/kernel/cpuset.c +++ b/kernel/cpuset.c @@ -147,6 +147,12 @@ typedef enum { CS_SPREAD_SLAB, } cpuset_flagbits_t; +/* the type of hotplug event */ +enum hotplug_event { + CPUSET_CPU_OFFLINE, + CPUSET_MEM_OFFLINE, +}; + /* convenient tests for these bits */ static inline int is_cpu_exclusive(const struct cpuset *cs) { @@ -1990,8 +1996,36 @@ static void remove_tasks_in_empty_cpuset(struct cpuset *cs) } /* - * Walk the specified cpuset subtree and look for empty cpusets. - * The tasks of such cpuset must be moved to a parent cpuset. + * Helper function to traverse cpusets. + * It can be used to walk the cpuset tree from top to bottom, completing + * one layer before dropping down to the next (thus always processing a + * node before any of its children). + */ +static struct cpuset *cpuset_next(struct list_head *queue) +{ + struct cpuset *cp; + struct cpuset *child; /* scans child cpusets of cp */ + struct cgroup *cont; + + if (list_empty(queue)) + return NULL; + + cp = list_first_entry(queue, struct cpuset, stack_list); + list_del(queue->next); + list_for_each_entry(cont, &cp->css.cgroup->children, sibling) { + child = cgroup_cs(cont); + list_add_tail(&child->stack_list, queue); + } + + return cp; +} + + +/* + * Walk the specified cpuset subtree upon a hotplug operation (CPU/Memory + * online/offline) and update the cpusets accordingly. + * For regular CPU/Mem hotplug, look for empty cpusets; the tasks of such + * cpuset must be moved to a parent cpuset. * * Called with cgroup_mutex held. We take callback_mutex to modify * cpus_allowed and mems_allowed. @@ -2000,50 +2034,61 @@ static void remove_tasks_in_empty_cpuset(struct cpuset *cs) * before dropping down to the next. It always processes a node before * any of its children. * - * For now, since we lack memory hot unplug, we'll never see a cpuset - * that has tasks along with an empty 'mems'. But if we did see such - * a cpuset, we'd handle it just like we do if its 'cpus' was empty. + * In the case of memory hot-unplug, it will remove nodes from N_HIGH_MEMORY + * if all present pages from a node are offlined. */ -static void scan_for_empty_cpusets(struct cpuset *root) +static void +scan_cpusets_upon_hotplug(struct cpuset *root, enum hotplug_event event) { LIST_HEAD(queue); - struct cpuset *cp; /* scans cpusets being updated */ - struct cpuset *child; /* scans child cpusets of cp */ - struct cgroup *cont; + struct cpuset *cp; /* scans cpusets being updated */ static nodemask_t oldmems; /* protected by cgroup_mutex */ list_add_tail((struct list_head *)&root->stack_list, &queue); - while (!list_empty(&queue)) { - cp = list_first_entry(&queue, struct cpuset, stack_list); - list_del(queue.next); - list_for_each_entry(cont, &cp->css.cgroup->children, sibling) { - child = cgroup_cs(cont); - list_add_tail(&child->stack_list, &queue); + switch (event) { + case CPUSET_CPU_OFFLINE: + while ((cp = cpuset_next(&queue)) != NULL) { + + /* Continue past cpusets with all cpus online */ + if (cpumask_subset(cp->cpus_allowed, cpu_active_mask)) + continue; + + /* Remove offline cpus from this cpuset. */ + mutex_lock(&callback_mutex); + cpumask_and(cp->cpus_allowed, cp->cpus_allowed, + cpu_active_mask); + mutex_unlock(&callback_mutex); + + /* Move tasks from the empty cpuset to a parent */ + if (cpumask_empty(cp->cpus_allowed)) + remove_tasks_in_empty_cpuset(cp); + else + update_tasks_cpumask(cp, NULL); } + break; - /* Continue past cpusets with all cpus, mems online */ - if (cpumask_subset(cp->cpus_allowed, cpu_active_mask) && - nodes_subset(cp->mems_allowed, node_states[N_HIGH_MEMORY])) - continue; + case CPUSET_MEM_OFFLINE: + while ((cp = cpuset_next(&queue)) != NULL) { - oldmems = cp->mems_allowed; + /* Continue past cpusets with all mems online */ + if (nodes_subset(cp->mems_allowed, + node_states[N_HIGH_MEMORY])) + continue; - /* Remove offline cpus and mems from this cpuset. */ - mutex_lock(&callback_mutex); - cpumask_and(cp->cpus_allowed, cp->cpus_allowed, - cpu_active_mask); - nodes_and(cp->mems_allowed, cp->mems_allowed, + oldmems = cp->mems_allowed; + + /* Remove offline mems from this cpuset. */ + mutex_lock(&callback_mutex); + nodes_and(cp->mems_allowed, cp->mems_allowed, node_states[N_HIGH_MEMORY]); - mutex_unlock(&callback_mutex); + mutex_unlock(&callback_mutex); - /* Move tasks from the empty cpuset to a parent */ - if (cpumask_empty(cp->cpus_allowed) || - nodes_empty(cp->mems_allowed)) - remove_tasks_in_empty_cpuset(cp); - else { - update_tasks_cpumask(cp, NULL); - update_tasks_nodemask(cp, &oldmems, NULL); + /* Move tasks from the empty cpuset to a parent */ + if (nodes_empty(cp->mems_allowed)) + remove_tasks_in_empty_cpuset(cp); + else + update_tasks_nodemask(cp, &oldmems, NULL); } } } @@ -2054,13 +2099,19 @@ static void scan_for_empty_cpusets(struct cpuset *root) * (of no affect) on systems that are actively using CPU hotplug * but making no active use of cpusets. * + * The only exception to this is suspend/resume, where we don't + * modify cpusets at all. + * * This routine ensures that top_cpuset.cpus_allowed tracks * cpu_active_mask on each CPU hotplug (cpuhp) event. * * Called within get_online_cpus(). Needs to call cgroup_lock() * before calling generate_sched_domains(). + * + * @cpu_online: Indicates whether this is a CPU online event (true) or + * a CPU offline event (false). */ -void cpuset_update_active_cpus(void) +void cpuset_upda |