Diffstat (limited to 'kernel')
46 files changed, 1477 insertions, 930 deletions
diff --git a/kernel/Makefile b/kernel/Makefile
index 15ab63ffe64..54f69837d35 100644
--- a/kernel/Makefile
+++ b/kernel/Makefile
@@ -2,7 +2,7 @@
 # Makefile for the linux kernel.
 #
 
-obj-y = sched.o fork.o exec_domain.o panic.o printk.o profile.o \
+obj-y = sched.o fork.o exec_domain.o panic.o printk.o \
 	    cpu.o exit.o itimer.o time.o softirq.o resource.o \
 	    sysctl.o capability.o ptrace.o timer.o user.o \
 	    signal.o sys.o kmod.o workqueue.o pid.o \
@@ -24,6 +24,7 @@ CFLAGS_REMOVE_sched_clock.o = -pg
 CFLAGS_REMOVE_sched.o = -mno-spe -pg
 endif
 
+obj-$(CONFIG_PROFILING) += profile.o
 obj-$(CONFIG_SYSCTL_SYSCALL_CHECK) += sysctl_check.o
 obj-$(CONFIG_STACKTRACE) += stacktrace.o
 obj-y += time/
diff --git a/kernel/acct.c b/kernel/acct.c
index 91e1cfd734d..dd68b905941 100644
--- a/kernel/acct.c
+++ b/kernel/acct.c
@@ -75,37 +75,39 @@ int acct_parm[3] = {4, 2, 30};
 /*
  * External references and all of the globals.
  */
-static void do_acct_process(struct pid_namespace *ns, struct file *);
+static void do_acct_process(struct bsd_acct_struct *acct,
+		struct pid_namespace *ns, struct file *);
 
 /*
  * This structure is used so that all the data protected by lock
  * can be placed in the same cache line as the lock.  This primes
  * the cache line to have the data after getting the lock.
  */
-struct acct_glbs {
-	spinlock_t		lock;
+struct bsd_acct_struct {
 	volatile int		active;
 	volatile int		needcheck;
 	struct file		*file;
 	struct pid_namespace	*ns;
 	struct timer_list	timer;
+	struct list_head	list;
 };
 
-static struct acct_glbs acct_globals __cacheline_aligned =
-	{__SPIN_LOCK_UNLOCKED(acct_globals.lock)};
+static DEFINE_SPINLOCK(acct_lock);
+static LIST_HEAD(acct_list);
 
 /*
  * Called whenever the timer says to check the free space.
  */
-static void acct_timeout(unsigned long unused)
+static void acct_timeout(unsigned long x)
 {
-	acct_globals.needcheck = 1;
+	struct bsd_acct_struct *acct = (struct bsd_acct_struct *)x;
+	acct->needcheck = 1;
 }
 
 /*
  * Check the amount of free space and suspend/resume accordingly.
  */
-static int check_free_space(struct file *file)
+static int check_free_space(struct bsd_acct_struct *acct, struct file *file)
 {
 	struct kstatfs sbuf;
 	int res;
@@ -113,11 +115,11 @@ static int check_free_space(struct file *file)
 	sector_t resume;
 	sector_t suspend;
 
-	spin_lock(&acct_globals.lock);
-	res = acct_globals.active;
-	if (!file || !acct_globals.needcheck)
+	spin_lock(&acct_lock);
+	res = acct->active;
+	if (!file || !acct->needcheck)
 		goto out;
-	spin_unlock(&acct_globals.lock);
+	spin_unlock(&acct_lock);
 
 	/* May block */
 	if (vfs_statfs(file->f_path.dentry, &sbuf))
@@ -136,35 +138,35 @@ static int check_free_space(struct file *file)
 		act = 0;
 
 	/*
-	 * If some joker switched acct_globals.file under us we'ld better be
+	 * If some joker switched acct->file under us we'ld better be
 	 * silent and _not_ touch anything.
 	 */
-	spin_lock(&acct_globals.lock);
-	if (file != acct_globals.file) {
+	spin_lock(&acct_lock);
+	if (file != acct->file) {
 		if (act)
 			res = act>0;
 		goto out;
 	}
 
-	if (acct_globals.active) {
+	if (acct->active) {
 		if (act < 0) {
-			acct_globals.active = 0;
+			acct->active = 0;
 			printk(KERN_INFO "Process accounting paused\n");
 		}
 	} else {
 		if (act > 0) {
-			acct_globals.active = 1;
+			acct->active = 1;
 			printk(KERN_INFO "Process accounting resumed\n");
 		}
 	}
 
-	del_timer(&acct_globals.timer);
-	acct_globals.needcheck = 0;
-	acct_globals.timer.expires = jiffies + ACCT_TIMEOUT*HZ;
-	add_timer(&acct_globals.timer);
-	res = acct_globals.active;
+	del_timer(&acct->timer);
+	acct->needcheck = 0;
+	acct->timer.expires = jiffies + ACCT_TIMEOUT*HZ;
+	add_timer(&acct->timer);
+	res = acct->active;
 out:
-	spin_unlock(&acct_globals.lock);
+	spin_unlock(&acct_lock);
 	return res;
 }
 
@@ -172,39 +174,41 @@ out:
  * Close the old accounting file (if currently open) and then replace
  * it with file (if non-NULL).
  *
- * NOTE: acct_globals.lock MUST be held on entry and exit.
+ * NOTE: acct_lock MUST be held on entry and exit.
  */
-static void acct_file_reopen(struct file *file)
+static void acct_file_reopen(struct bsd_acct_struct *acct, struct file *file,
+		struct pid_namespace *ns)
 {
 	struct file *old_acct = NULL;
 	struct pid_namespace *old_ns = NULL;
 
-	if (acct_globals.file) {
-		old_acct = acct_globals.file;
-		old_ns = acct_globals.ns;
-		del_timer(&acct_globals.timer);
-		acct_globals.active = 0;
-		acct_globals.needcheck = 0;
-		acct_globals.file = NULL;
+	if (acct->file) {
+		old_acct = acct->file;
+		old_ns = acct->ns;
+		del_timer(&acct->timer);
+		acct->active = 0;
+		acct->needcheck = 0;
+		acct->file = NULL;
+		acct->ns = NULL;
+		list_del(&acct->list);
 	}
 	if (file) {
-		acct_globals.file = file;
-		acct_globals.ns = get_pid_ns(task_active_pid_ns(current));
-		acct_globals.needcheck = 0;
-		acct_globals.active = 1;
+		acct->file = file;
+		acct->ns = ns;
+		acct->needcheck = 0;
+		acct->active = 1;
+		list_add(&acct->list, &acct_list);
 		/* It's been deleted if it was used before so this is safe */
-		init_timer(&acct_globals.timer);
-		acct_globals.timer.function = acct_timeout;
-		acct_globals.timer.expires = jiffies + ACCT_TIMEOUT*HZ;
-		add_timer(&acct_globals.timer);
+		setup_timer(&acct->timer, acct_timeout, (unsigned long)acct);
+		acct->timer.expires = jiffies + ACCT_TIMEOUT*HZ;
+		add_timer(&acct->timer);
 	}
 	if (old_acct) {
 		mnt_unpin(old_acct->f_path.mnt);
-		spin_unlock(&acct_globals.lock);
-		do_acct_process(old_ns, old_acct);
+		spin_unlock(&acct_lock);
+		do_acct_process(acct, old_ns, old_acct);
 		filp_close(old_acct, NULL);
-		put_pid_ns(old_ns);
-		spin_lock(&acct_globals.lock);
+		spin_lock(&acct_lock);
 	}
 }
 
@@ -212,6 +216,8 @@ static int acct_on(char *name)
 {
 	struct file *file;
 	int error;
+	struct pid_namespace *ns;
+	struct bsd_acct_struct *acct = NULL;
 
 	/* Difference from BSD - they don't do O_APPEND */
 	file = filp_open(name, O_WRONLY|O_APPEND|O_LARGEFILE, 0);
@@ -228,18 +234,34 @@ static int acct_on(char *name)
 		return -EIO;
 	}
 
+	ns = task_active_pid_ns(current);
+	if (ns->bacct == NULL) {
+		acct = kzalloc(sizeof(struct bsd_acct_struct), GFP_KERNEL);
+		if (acct == NULL) {
+			filp_close(file, NULL);
+			return -ENOMEM;
+		}
+	}
+
 	error = security_acct(file);
 	if (error) {
+		kfree(acct);
 		filp_close(file, NULL);
 		return error;
 	}
 
-	spin_lock(&acct_globals.lock);
+	spin_lock(&acct_lock);
+	if (ns->bacct == NULL) {
+		ns->bacct = acct;
+		acct = NULL;
+	}
+
 	mnt_pin(file->f_path.mnt);
-	acct_file_reopen(file);
-	spin_unlock(&acct_globals.lock);
+	acct_file_reopen(ns->bacct, file, ns);
+	spin_unlock(&acct_lock);
 
 	mntput(file->f_path.mnt);	/* it's pinned, now give up active reference */
+	kfree(acct);
 
 	return 0;
 }
 
@@ -269,11 +291,17 @@ asmlinkage long sys_acct(const char __user *name)
 		error = acct_on(tmp);
 		putname(tmp);
 	} else {
+		struct bsd_acct_struct *acct;
+
+		acct = task_active_pid_ns(current)->bacct;
+		if (acct == NULL)
+			return 0;
+
 		error = security_acct(NULL);
 		if (!error) {
-			spin_lock(&acct_globals.lock);
-			acct_file_reopen(NULL);
-			spin_unlock(&acct_globals.lock);
+			spin_lock(&acct_lock);
+			acct_file_reopen(acct, NULL, NULL);
+			spin_unlock(&acct_lock);
 		}
 	}
 	return error;
@@ -288,10 +316,16 @@ asmlinkage long sys_acct(const char __user *name)
  */
 void acct_auto_close_mnt(struct vfsmount *m)
 {
-	spin_lock(&acct_globals.lock);
-	if (acct_globals.file && acct_globals.file->f_path.mnt == m)
-		acct_file_reopen(NULL);
-	spin_unlock(&acct_globals.lock);
+	struct bsd_acct_struct *acct;
+
+	spin_lock(&acct_lock);
+restart:
+	list_for_each_entry(acct, &acct_list, list)
+		if (acct->file && acct->file->f_path.mnt == m) {
+			acct_file_reopen(acct, NULL, NULL);
+			goto restart;
+		}
+	spin_unlock(&acct_lock);
 }
 
 /**
@@ -303,12 +337,31 @@ void acct_auto_close_mnt(struct vfsmount *m)
  */
 void acct_auto_close(struct super_block *sb)
 {
-	spin_lock(&acct_globals.lock);
-	if (acct_globals.file &&
-	    acct_globals.file->f_path.mnt->mnt_sb == sb) {
-		acct_file_reopen(NULL);
+	struct bsd_acct_struct *acct;
+
+	spin_lock(&acct_lock);
+restart:
+	list_for_each_entry(acct, &acct_list, list)
+		if (acct->file && acct->file->f_path.mnt->mnt_sb == sb) {
+			acct_file_reopen(acct, NULL, NULL);
+			goto restart;
+		}
+	spin_unlock(&acct_lock);
+}
+
+void acct_exit_ns(struct pid_namespace *ns)
+{
+	struct bsd_acct_struct *acct;
+
+	spin_lock(&acct_lock);
+	acct = ns->bacct;
+	if (acct != NULL) {
+		if (acct->file != NULL)
+			acct_file_reopen(acct, NULL, NULL);
+
+		kfree(acct);
 	}
-	spin_unlock(&acct_globals.lock);
+	spin_unlock(&acct_lock);
 }
 
 /*
@@ -425,7 +478,8 @@ static u32 encode_float(u64 value)
 /*
  *  do_acct_process does all actual work. Caller holds the reference to file.
  */
-static void do_acct_process(struct pid_namespace *ns, struct file *file)
+static void do_acct_process(struct bsd_acct_struct *acct,
+		struct pid_namespace *ns, struct file *file)
 {
 	struct pacct_struct *pacct = &current->signal->pacct;
 	acct_t ac;
@@ -440,7 +494,7 @@ static void do_acct_process(struct pid_namespace *ns, struct file *file)
 	 * First check to see if there is enough free_space to continue
 	 * the process accounting system.
 	 */
-	if (!check_free_space(file))
+	if (!check_free_space(acct, file))
 		return;
 
 	/*
@@ -577,34 +631,46 @@ void acct_collect(long exitcode, int group_dead)
 	spin_unlock_irq(&current->sighand->siglock);
 }
 
-/**
- * acct_process - now just a wrapper around do_acct_process
- * @exitcode: task exit code
- *
- * handles process accounting for an exiting task
- */
-void acct_process(void)
+static void acct_process_in_ns(struct pid_namespace *ns)
 {
 	struct file *file = NULL;
-	struct pid_namespace *ns;
+	struct bsd_acct_struct *acct;
 
+	acct = ns->bacct;
 	/*
 	 * accelerate the common fastpath:
 	 */
-	if (!acct_globals.file)
+	if (!acct || !acct->file)
 		return;
 
-	spin_lock(&acct_globals.lock);
-	file = acct_globals.file;
+	spin_lock(&acct_lock);
+	file = acct->file;
 	if (unlikely(!file)) {
-		spin_unlock(&acct_globals.lock);
+		spin_unlock(&acct_lock);
 		return;
 	}
 	get_file(file);
-	ns = get_pid_ns(acct_globals.ns);
-	spin_unlock(&acct_globals.lock);
+	spin_unlock(&acct_lock);
 
-	do_acct_process(ns, file);
+	do_acct_process(acct, ns, file);
 	fput(file);
-	put_pid_ns(ns);
+}
+
+/**
+ * acct_process - now just a wrapper around acct_process_in_ns,
+ * which in turn is a wrapper around do_acct_process.
+ *
+ * handles process accounting for an exiting task
+ */
+void acct_process(void)
+{
+	struct pid_namespace *ns;
+
+	/*
+	 * This loop is safe lockless, since current is still
+	 * alive and holds its namespace, which in turn holds
+	 * its parent.
+	 */
+	for (ns = task_active_pid_ns(current); ns != NULL; ns = ns->parent)
+		acct_process_in_ns(ns);
 }
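The kernel/acct.c diff above replaces the single global acct_globals with one bsd_acct_struct per PID namespace (hung off ns->bacct and linked on acct_list), and acct_process() now walks the namespace parent chain so every ancestor namespace with accounting enabled writes its own record for the exiting task. The following stand-alone user-space sketch is not kernel code: acct_ctx and pidns are simplified, hypothetical stand-ins for bsd_acct_struct and struct pid_namespace, used only to illustrate that per-namespace layout and the parent walk.

/* illustration only -- not part of the diff above */
#include <stdio.h>

struct acct_ctx {                  /* stand-in for struct bsd_acct_struct */
	const char *file;          /* stand-in for the accounting file */
	int active;
};

struct pidns {                     /* stand-in for struct pid_namespace */
	const char *name;
	struct acct_ctx *bacct;    /* at most one accounting context per namespace */
	struct pidns *parent;
};

static void acct_process_in_ns(struct pidns *ns)
{
	struct acct_ctx *acct = ns->bacct;

	if (!acct || !acct->active)
		return;            /* fast path: no accounting enabled in this namespace */
	printf("record written to %s (namespace %s)\n", acct->file, ns->name);
}

int main(void)
{
	struct acct_ctx root_acct = { "/var/log/account/pacct", 1 };
	struct pidns init_ns  = { "init",  &root_acct, NULL };
	struct pidns child_ns = { "child", NULL,       &init_ns };

	/* mirrors acct_process(): walk from the task's namespace up to the root */
	for (struct pidns *ns = &child_ns; ns != NULL; ns = ns->parent)
		acct_process_in_ns(ns);
	return 0;
}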
diff --git a/kernel/cgroup.c b/kernel/cgroup.c
index 15ac0e1e4f4..657f8f8d93a 100644
--- a/kernel/cgroup.c
+++ b/kernel/cgroup.c
@@ -45,6 +45,7 @@
 #include <linux/delayacct.h>
 #include <linux/cgroupstats.h>
 #include <linux/hash.h>
+#include <linux/namei.h>
 
 #include <asm/atomic.h>
 
@@ -89,11 +90,7 @@ struct cgroupfs_root {
 	/* Hierarchy-specific flags */
 	unsigned long flags;
 
-	/* The path to use for release notifications. No locking
-	 * between setting and use - so if userspace updates this
-	 * while child cgroups exist, you could miss a
-	 * notification. We ensure that it's always a valid
-	 * NUL-terminated string */
+	/* The path to use for release notifications. */
 	char release_agent_path[PATH_MAX];
 };
 
@@ -118,7 +115,7 @@ static int root_count;
  * extra work in the fork/exit path if none of the subsystems need to
  * be called.
  */
-static int need_forkexit_callback;
+static int need_forkexit_callback __read_mostly;
 static int need_mm_owner_callback __read_mostly;
 
 /* convenient tests for these bits */
@@ -220,7 +217,7 @@ static struct hlist_head *css_set_hash(struct cgroup_subsys_state *css[])
  * task until after the first call to cgroup_iter_start(). This
  * reduces the fork()/exit() overhead for people who have cgroups
  * compiled into their kernel but not actually in use */
-static int use_task_css_set_links;
+static int use_task_css_set_links __read_mostly;
 
 /* When we create or destroy a css_set, the operation simply
  * takes/releases a reference count on all the cgroups referenced
@@ -241,17 +238,20 @@ static int use_task_css_set_links;
  */
 static void unlink_css_set(struct css_set *cg)
 {
+	struct cg_cgroup_link *link;
+	struct cg_cgroup_link *saved_link;
+
 	write_lock(&css_set_lock);
 	hlist_del(&cg->hlist);
 	css_set_count--;
-	while (!list_empty(&cg->cg_links)) {
-		struct cg_cgroup_link *link;
-		link = list_entry(cg->cg_links.next,
-				  struct cg_cgroup_link, cg_link_list);
+
+	list_for_each_entry_safe(link, saved_link, &cg->cg_links,
+				 cg_link_list) {
 		list_del(&link->cg_link_list);
 		list_del(&link->cgrp_link_list);
 		kfree(link);
 	}
+
 	write_unlock(&css_set_lock);
 }
 
@@ -363,15 +363,14 @@ static struct css_set *find_existing_css_set(
 static int allocate_cg_links(int count, struct list_head *tmp)
 {
 	struct cg_cgroup_link *link;
+	struct cg_cgroup_link *saved_link;
 	int i;
 	INIT_LIST_HEAD(tmp);
 	for (i = 0; i < count; i++) {
 		link = kmalloc(sizeof(*link), GFP_KERNEL);
 		if (!link) {
-			while (!list_empty(tmp)) {
-				link = list_entry(tmp->next,
-						  struct cg_cgroup_link,
-						  cgrp_link_list);
+			list_for_each_entry_safe(link, saved_link, tmp,
						 cgrp_link_list) {
 				list_del(&link->cgrp_link_list);
 				kfree(link);
 			}
@@ -384,11 +383,10 @@ static int allocate_cg_links(int count, struct list_head *tmp)
 
 static void free_cg_links(struct list_head *tmp)
 {
-	while (!list_empty(tmp)) {
-		struct cg_cgroup_link *link;
-		link = list_entry(tmp->next,
-				  struct cg_cgroup_link,
-				  cgrp_link_list);
+	struct cg_cgroup_link *link;
+	struct cg_cgroup_link *saved_link;
+
+	list_for_each_entry_safe(link, saved_link, tmp, cgrp_link_list) {
 		list_del(&link->cgrp_link_list);
 		kfree(link);
 	}
@@ -415,11 +413,11 @@ static struct css_set *find_css_set(
 
 	/* First see if we already have a cgroup group that matches
 	 * the desired set */
-	write_lock(&css_set_lock);
+	read_lock(&css_set_lock);
 	res = find_existing_css_set(oldcg, cgrp, template);
 	if (res)
 		get_css_set(res);
-	write_unlock(&css_set_lock);
+	read_unlock(&css_set_lock);
 
 	if (res)
 		return res;
@@ -507,10 +505,6 @@ static struct css_set *find_css_set(
  * knows that the cgroup won't be removed, as cgroup_rmdir()
  * needs that mutex.
  *
- * The cgroup_common_file_write handler for operations that modify
- * the cgroup hierarchy holds cgroup_mutex across the entire operation,
- * single threading all such cgroup modifications across the system.
- *
  * The fork and exit callbacks cgroup_fork() and cgroup_exit(), don't
  * (usually) take cgroup_mutex.  These are the two most performance
  * critical pieces of code here.  The exception occurs on cgroup_exit(),
@@ -1093,6 +1087,8 @@ static void cgroup_kill_sb(struct super_block *sb) {
 	struct cgroupfs_root *root = sb->s_fs_info;
 	struct cgroup *cgrp = &root->top_cgroup;
 	int ret;
+	struct cg_cgroup_link *link;
+	struct cg_cgroup_link *saved_link;
 
 	BUG_ON(!root);
 
@@ -1112,10 +1108,9 @@ static void cgroup_kill_sb(struct super_block *sb) {
 	 * root cgroup
 	 */
 	write_lock(&css_set_lock);
-	while (!list_empty(&cgrp->css_sets)) {
-		struct cg_cgroup_link *link;
-		link = list_entry(cgrp->css_sets.next,
-				  struct cg_cgroup_link, cgrp_link_list);
+
+	list_for_each_entry_safe(link, saved_link, &cgrp->css_sets,
+				 cgrp_link_list) {
 		list_del(&link->cg_link_list);
 		list_del(&link->cgrp_link_list);
 		kfree(link);
 	}
@@ -1281,18 +1276,14 @@ int cgroup_attach_task(struct cgroup *cgrp, struct task_struct *tsk)
 }
 
 /*
- * Attach task with pid 'pid' to cgroup 'cgrp'. Call with
- * cgroup_mutex, may take task_lock of task
+ * Attach task with pid 'pid' to cgroup 'cgrp'. Call with cgroup_mutex
+ * held. May take task_lock of task
  */
-static int attach_task_by_pid(struct cgroup *cgrp, char *pidbuf)
+static int attach_task_by_pid(struct cgroup *cgrp, u64 pid)
 {
-	pid_t pid;
 	struct task_struct *tsk;
 	int ret;
 
-	if (sscanf(pidbuf, "%d", &pid) != 1)
-		return -EIO;
-
 	if (pid) {
 		rcu_read_lock();
 		tsk = find_task_by_vpid(pid);
@@ -1318,6 +1309,16 @@ static int attach_task_by_pid(struct cgroup *cgrp, char *pidbuf)
 	return ret;
 }
 
+static int cgroup_tasks_write(struct cgroup *cgrp, struct cftype *cft, u64 pid)
+{
+	int ret;
+	if (!cgroup_lock_live_group(cgrp))
+		return -ENODEV;
+	ret = attach_task_by_pid(cgrp, pid);
+	cgroup_unlock();
+	return ret;
+}
+
 /* The various types of files and directories in a cgroup file system */
 enum cgroup_filetype {
 	FILE_ROOT,
@@ -1327,12 +1328,54 @@ enum cgroup_filetype {
 	FILE_RELEASE_AGENT,
 };
 
+/**
+ * cgroup_lock_live_group - take cgroup_mutex and check that cgrp is alive.
+ * @cgrp: the cgroup to be checked for liveness
+ *
+ * On success, returns true; the lock should be later released with
+ * cgroup_unlock(). On failure returns false with no lock held.
+ */
+bool cgroup_lock_live_group(struct cgroup *cgrp)
+{
+	mutex_lock(&cgroup_mutex);
+	if (cgroup_is_removed(cgrp)) {
+		mutex_unlock(&cgroup_mutex);
+		return false;
+	}
+	return true;
+}
+
+static int cgroup_release_agent_write(struct cgroup *cgrp, struct cftype *cft,
+				      const char *buffer)
+{
+	BUILD_BUG_ON(sizeof(cgrp->root->release_agent_path) < PATH_MAX);
+	if (!cgroup_lock_live_group(cgrp))
+		return -ENODEV;
+	strcpy(cgrp->root->release_agent_path, buffer);
+	cgroup_unlock();
+	return 0;
+}
+
+static int cgroup_release_agent_show(struct cgroup *cgrp, struct cftype *cft,
+				     struct seq_file *seq)
+{
+	if (!cgroup_lock_live_group(cgrp))
+		return -ENODEV;
+	seq_puts(seq, cgrp->root->release_agent_path);
+	seq_putc(seq, '\n');
+	cgroup_unlock();
+	return 0;
+}
+
+/* A buffer size big enough for numbers or short strings */
+#define CGROUP_LOCAL_BUFFER_SIZE 64
+
 static ssize_t cgroup_write_X64(struct cgroup *cgrp, struct cftype *cft,
 				struct file *file,
 				const char __user *userbuf,
 				size_t nbytes, loff_t *unused_ppos)
 {
-	char buffer[64];
+	char buffer[CGROUP_LOCAL_BUFFER_SIZE];
 	int retval = 0;
 	char *end;
 
@@ -1361,68 +1404,36 @@ static ssize_t cgroup_write_X64(struct cgroup *cgrp, struct cftype *cft,
 	return retval;
 }
 
-static ssize_t cgroup_common_file_write(struct cgroup *cgrp,
-					struct cftype *cft,
-					struct file *file,
-					const char __user *userbuf,
-					size_t nbytes, loff_t *unused_ppos)
+static ssize_t cgroup_write_string(struct cgroup *cgrp, struct cftype *cft,
+				   struct file *file,
+				   const char __user *userbuf,
+				   size_t nbytes, loff_t *unused_ppos)
 {
-	enum cgroup_filetype type = cft->private;
-	char *buffer;
+	char local_buffer[CGROUP_LOCAL_BUFFER_SIZE];
 	int retval = 0;
+	size_t max_bytes = cft->max_write_len;
+	char *buffer = local_buffer;
 
-	if (nbytes >= PATH_MAX)
+	if (!max_bytes)
+		max_bytes = sizeof(local_buffer) - 1;
+	if (nbytes >= max_bytes)
 		return -E2BIG;
-
-	/* +1 for nul-terminator */
-	buffer = kmalloc(nbytes + 1, GFP_KERNEL);
-	if (buffer == NULL)
-		return -ENOMEM;
-
-	if (copy_from_user(buffer, userbuf, nbytes)) {
-		retval = -EFAULT;
-		goto out1;
+	/* Allocate a dynamic buffer if we need one */
+	if (nbytes >= sizeof(local_buffer)) {
+		buffer = kmalloc(nbytes + 1, GFP_KERNEL);
+		if (buffer == NULL)
+			return -ENOMEM;
 	}
-	buffer[nbytes] = 0;	/* nul-terminate */
-	strstrip(buffer);	/* strip -just- trailing whitespace */
-
-	mutex_lock(&cgroup_mutex);
+	if (nbytes && copy_from_user(buffer, userbuf, nbytes))
+		return -EFAULT;
 
-	/*
-	 * This was already checked for in cgroup_file_write(), but
-	 * check again now we're holding cgroup_mutex.
-	 */
-	if (cgroup_is_removed(cgrp)) {
-		retval = -ENODEV;
-		goto out2;
-	}
-
-	switch (type) {
-	case FILE_TASKLIST:
-		retval = attach_task_by_pid(cgrp, buffer);
-		break;
-	case FILE_NOTIFY_ON_RELEASE:
-		clear_bit(CGRP_RELEASABLE, &cgrp->flags);
-		if (simple_strtoul(buffer, NULL, 10) != 0)
-			set_bit(CGRP_NOTIFY_ON_RELEASE, &cgrp->flags);
-		else
-			clear_bit(CGRP_NOTIFY_ON_RELEASE, &cgrp->flags);
-		break;
-	case FILE_RELEASE_AGENT:
-		BUILD_BUG_ON(sizeof(cgrp->root->release_agent_path) < PATH_MAX);
-		strcpy(cgrp->root->release_agent_path, buffer);
-		break;
-	default:
-		retval = -EINVAL;
-		goto out2;
-	}
-
-	if (retval == 0)
+	buffer[nbytes] = 0;	/* nul-terminate */
+	strstrip(buffer);
+	retval = cft->write_string(cgrp, cft, buffer);
+	if (!retval)
 		retval = nbytes;
-out2:
-	mutex_unlock(&cgroup_mutex);
-out1:
-	kfree(buffer);
+	if (buffer != local_buffer)
+		kfree(buffer);
 	return retval;
 }
 
@@ -1438,6 +1449,8 @@ static ssize_t cgroup_file_write(struct file *file, const char __user *buf,
 		return cft->write(cgrp, cft, file, buf, nbytes, ppos);
 	if (cft->write_u64 || cft->write_s64)
 		return cgroup_write_X64(cgrp, cft, file, buf, nbytes, ppos);
+	if (cft->write_string)
+		return cgroup_write_string(cgrp, cft, file, buf, nbytes, ppos);
 	if (cft->trigger) {
 		int ret = cft->trigger(cgrp, (unsigned int)cft->private);
 		return ret ? ret : nbytes;
@@ -1450,7 +1463,7 @@ static ssize_t cgroup_read_u64(struct cgroup *cgrp, struct cftype *cft,
 			       char __user *buf, size_t nbytes,
 			       loff_t *ppos)
 {
-	char tmp[64];
+	char tmp[CGROUP_LOCAL_BUFFER_SIZE];
 	u64 val = cft->read_u64(cgrp, cft);
 	int len = sprintf(tmp, "%llu\n", (unsigned long long) val);
 
@@ -1462,56 +1475,13 @@ static ssize_t cgroup_read_s64(struct cgroup *cgrp, struct
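The cgroup.c half of the diff replaces cgroup_common_file_write() with a generic cgroup_write_string() helper: short writes stay in a small on-stack buffer, longer ones (bounded by the control file's max_write_len) fall back to a heap allocation, and the buffer is freed only when it is not the local one, after the per-file string handler has run. Below is a stand-alone user-space sketch of that buffering pattern, not the kernel implementation: malloc()/free(), memcpy() and printf() stand in for kmalloc()/kfree(), copy_from_user() and the cft->write_string() callback.

/* illustration only -- not part of the diff above */
#include <stdio.h>
#include <stdlib.h>
#include <string.h>

#define LOCAL_BUFFER_SIZE 64

static int handle_write(const char *userbuf, size_t nbytes, size_t max_bytes)
{
	char local_buffer[LOCAL_BUFFER_SIZE];
	char *buffer = local_buffer;
	int retval;

	if (!max_bytes)
		max_bytes = sizeof(local_buffer) - 1;
	if (nbytes >= max_bytes)
		return -1;                       /* mirrors -E2BIG */

	/* Allocate a dynamic buffer only if we need one */
	if (nbytes >= sizeof(local_buffer)) {
		buffer = malloc(nbytes + 1);
		if (!buffer)
			return -1;               /* mirrors -ENOMEM */
	}
	memcpy(buffer, userbuf, nbytes);          /* copy_from_user() stand-in */
	buffer[nbytes] = '\0';

	printf("handler got: \"%s\"\n", buffer);  /* cft->write_string() stand-in */
	retval = (int)nbytes;

	if (buffer != local_buffer)
		free(buffer);
	return retval;
}

int main(void)
{
	const char *long_path = "/sbin/cgroup_release_agent_with_a_rather_long_name";

	handle_write("17", 2, 0);                          /* fits in the stack buffer */
	handle_write(long_path, strlen(long_path), 4096);  /* heap fallback */
	return 0;
}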