From 0415b00d175e0d8945e6785aad21b5f157976ce0 Mon Sep 17 00:00:00 2001 From: Tejun Heo Date: Thu, 24 Mar 2011 18:50:09 +0100 Subject: percpu: Always align percpu output section to PAGE_SIZE Percpu allocator honors alignment requests up to PAGE_SIZE and both the percpu addresses in the percpu address space and the translated kernel addresses should be aligned accordingly. The calculation of the former depends on the alignment of the percpu output section in the kernel image. The linker script macros PERCPU_VADDR() and PERCPU() are used to define this output section and the latter takes an @align parameter. Several architectures are using @align smaller than PAGE_SIZE, breaking percpu memory alignment. This patch removes the @align parameter from PERCPU(), renames it to PERCPU_SECTION() and makes it always align to PAGE_SIZE. While at it, add PCPU_SETUP_BUG_ON() checks such that alignment problems are reliably detected and remove the percpu alignment comment recently added in workqueue.c, as the condition would trigger BUG way before reaching there. For um, this patch raises the alignment of the percpu area. As the area is in .init, there shouldn't be any noticeable difference. This problem was discovered by David Howells while debugging boot failure on mn10300. Signed-off-by: Tejun Heo Acked-by: Mike Frysinger Cc: uclinux-dist-devel@blackfin.uclinux.org Cc: David Howells Cc: Jeff Dike Cc: user-mode-linux-devel@lists.sourceforge.net --- kernel/workqueue.c | 4 +--- 1 file changed, 1 insertion(+), 3 deletions(-) (limited to 'kernel') diff --git a/kernel/workqueue.c b/kernel/workqueue.c index 04ef830690e..d30a502e8c6 100644 --- a/kernel/workqueue.c +++ b/kernel/workqueue.c @@ -2860,9 +2860,7 @@ static int alloc_cwqs(struct workqueue_struct *wq) } } - /* just in case, make sure it's actually aligned * - this is affected by PERCPU() alignment in vmlinux.lds.S */ + /* just in case, make sure it's actually aligned */ BUG_ON(!IS_ALIGNED(wq->cpu_wq.v, align)); return wq->cpu_wq.v ? 0 : -ENOMEM; } -- cgit v1.2.3-70-g09d2 From 17f60a7da150fdd0cfb9756f86a262daa72c835f Mon Sep 17 00:00:00 2001 From: Eric Paris Date: Fri, 1 Apr 2011 17:07:50 -0400 Subject: capabilities: allow the application of capability limits to usermode helpers There is no way to limit the capabilities of usermode helpers. This problem reared its head recently when someone complained that any user with cap_net_admin was able to load arbitrary kernel modules, even though the user didn't have cap_sys_module. The reason is that the actual load is done by a usermode helper and those always have the full cap set. This patch adds new sysctls which allow us to bound the permissions of usermode helpers. /proc/sys/kernel/usermodehelper/bset /proc/sys/kernel/usermodehelper/inheritable You must have CAP_SYS_MODULE and CAP_SETPCAP to change these (changes are &= ONLY). When the kernel launches a usermodehelper it will do so with these as the bset and pI. -v2: make globals static create spinlock to protect globals -v3: require both CAP_SETPCAP and CAP_SYS_MODULE -v4: fix the typo s/CAP_SET_PCAP/CAP_SETPCAP/ because I didn't commit Signed-off-by: Eric Paris No-objection-from: Serge E. Hallyn Acked-by: David Howells Acked-by: Serge E. Hallyn Acked-by: Andrew G. Morgan Signed-off-by: James Morris
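[Editorial illustration, not part of the patch: each of the two files exposes _KERNEL_CAPABILITY_U32S unsigned longs, least significant 32 bits first, and because the handler intersects the written value with the current one, a write can clear bits but never set them. A hypothetical userspace sketch that drops CAP_SYS_MODULE (capability 16) from the helper bounding set, assuming the usual two-word capability layout:]

#include <stdio.h>

int main(void)
{
	unsigned long lo, hi;	/* assumes _KERNEL_CAPABILITY_U32S == 2 */
	FILE *f = fopen("/proc/sys/kernel/usermodehelper/bset", "r+");

	if (!f || fscanf(f, "%lu %lu", &lo, &hi) != 2)
		return 1;
	rewind(f);
	lo &= ~(1UL << 16);	/* clear CAP_SYS_MODULE from the low word */
	/* the writer itself must hold CAP_SETPCAP and CAP_SYS_MODULE */
	fprintf(f, "%lu %lu\n", lo, hi);
	return fclose(f) ? 1 : 0;
}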
--- include/linux/kmod.h | 3 ++ kernel/kmod.c | 100 +++++++++++++++++++++++++++++++++++++++++++++++++++ kernel/sysctl.c | 6 ++++ 3 files changed, 109 insertions(+) (limited to 'kernel') diff --git a/include/linux/kmod.h b/include/linux/kmod.h index 6efd7a78de6..79bb98d7185 100644 --- a/include/linux/kmod.h +++ b/include/linux/kmod.h @@ -24,6 +24,7 @@ #include #include #include +#include #define KMOD_PATH_LEN 256 @@ -109,6 +110,8 @@ call_usermodehelper(char *path, char **argv, char **envp, enum umh_wait wait) NULL, NULL, NULL); } +extern struct ctl_table usermodehelper_table[]; + extern void usermodehelper_init(void); extern int usermodehelper_disable(void); diff --git a/kernel/kmod.c b/kernel/kmod.c index 9cd0591c96a..06fdea2819b 100644 --- a/kernel/kmod.c +++ b/kernel/kmod.c @@ -25,6 +25,7 @@ #include #include #include +#include #include #include #include @@ -43,6 +44,13 @@ extern int max_threads; static struct workqueue_struct *khelper_wq; +#define CAP_BSET (void *)1 +#define CAP_PI (void *)2 + +static kernel_cap_t usermodehelper_bset = CAP_FULL_SET; +static kernel_cap_t usermodehelper_inheritable = CAP_FULL_SET; +static DEFINE_SPINLOCK(umh_sysctl_lock); + #ifdef CONFIG_MODULES /* @@ -132,6 +140,7 @@ EXPORT_SYMBOL(__request_module); static int ____call_usermodehelper(void *data) { struct subprocess_info *sub_info = data; + struct cred *new; int retval; spin_lock_irq(&current->sighand->siglock); @@ -153,6 +162,19 @@ static int ____call_usermodehelper(void *data) goto fail; } + retval = -ENOMEM; + new = prepare_kernel_cred(current); + if (!new) + goto fail; + + spin_lock(&umh_sysctl_lock); + new->cap_bset = cap_intersect(usermodehelper_bset, new->cap_bset); + new->cap_inheritable = cap_intersect(usermodehelper_inheritable, + new->cap_inheritable); + spin_unlock(&umh_sysctl_lock); + + commit_creds(new); + retval = kernel_execve(sub_info->path, (const char *const *)sub_info->argv, (const char *const *)sub_info->envp); @@ -418,6 +440,84 @@ unlock: } EXPORT_SYMBOL(call_usermodehelper_exec); +static int proc_cap_handler(struct ctl_table *table, int write, + void __user *buffer, size_t *lenp, loff_t *ppos) +{ + struct ctl_table t; + unsigned long cap_array[_KERNEL_CAPABILITY_U32S]; + kernel_cap_t new_cap; + int err, i; + + if (write && (!capable(CAP_SETPCAP) || + !capable(CAP_SYS_MODULE))) + return -EPERM; + + /* + * convert from the global kernel_cap_t to the ulong array to print to + * userspace if this is a read. + */ + spin_lock(&umh_sysctl_lock); + for (i = 0; i < _KERNEL_CAPABILITY_U32S; i++) { + if (table->data == CAP_BSET) + cap_array[i] = usermodehelper_bset.cap[i]; + else if (table->data == CAP_PI) + cap_array[i] = usermodehelper_inheritable.cap[i]; + else + BUG(); + } + spin_unlock(&umh_sysctl_lock); + + t = *table; + t.data = &cap_array; + + /* + * actually read or write an array of ulongs from userspace.
Remember + * these are least significant 32 bits first + */ + err = proc_doulongvec_minmax(&t, write, buffer, lenp, ppos); + if (err < 0) + return err; + + /* + * convert from the sysctl array of ulongs to the kernel_cap_t + * internal representation + */ + for (i = 0; i < _KERNEL_CAPABILITY_U32S; i++) + new_cap.cap[i] = cap_array[i]; + + /* + * Drop everything not in the new_cap (but don't add things) + */ + spin_lock(&umh_sysctl_lock); + if (write) { + if (table->data == CAP_BSET) + usermodehelper_bset = cap_intersect(usermodehelper_bset, new_cap); + if (table->data == CAP_PI) + usermodehelper_inheritable = cap_intersect(usermodehelper_inheritable, new_cap); + } + spin_unlock(&umh_sysctl_lock); + + return 0; +} + +struct ctl_table usermodehelper_table[] = { + { + .procname = "bset", + .data = CAP_BSET, + .maxlen = _KERNEL_CAPABILITY_U32S * sizeof(unsigned long), + .mode = 0600, + .proc_handler = proc_cap_handler, + }, + { + .procname = "inheritable", + .data = CAP_PI, + .maxlen = _KERNEL_CAPABILITY_U32S * sizeof(unsigned long), + .mode = 0600, + .proc_handler = proc_cap_handler, + }, + { } +}; + void __init usermodehelper_init(void) { khelper_wq = create_singlethread_workqueue("khelper"); diff --git a/kernel/sysctl.c b/kernel/sysctl.c index c0bb32414b1..965134bed6c 100644 --- a/kernel/sysctl.c +++ b/kernel/sysctl.c @@ -56,6 +56,7 @@ #include #include #include +#include #include #include @@ -615,6 +616,11 @@ static struct ctl_table kern_table[] = { .mode = 0555, .child = random_table, }, + { + .procname = "usermodehelper", + .mode = 0555, + .child = usermodehelper_table, + }, { .procname = "overflowuid", .data = &overflowuid, -- cgit v1.2.3-70-g09d2 From ffa8e59df047d57e812a04f7d6baf6a25c652c0c Mon Sep 17 00:00:00 2001 From: Eric Paris Date: Fri, 1 Apr 2011 17:08:34 -0400 Subject: capabilities: do not drop CAP_SETPCAP from the initial task In olden' days of yore CAP_SETPCAP had special meaning for the init task. We actually have code to make sure that CAP_SETPCAP wasn't in pE of things using the init_cred. But CAP_SETPCAP isn't so special any more and we don't have a reason to special case dropping it for init or kthreads.... Signed-off-by: Eric Paris Acked-by: Andrew G. 
Morgan Signed-off-by: James Morris --- include/linux/capability.h | 6 ++++-- kernel/capability.c | 2 -- 2 files changed, 4 insertions(+), 4 deletions(-) (limited to 'kernel') diff --git a/include/linux/capability.h b/include/linux/capability.h index 16ee8b49a20..11d562863e4 100644 --- a/include/linux/capability.h +++ b/include/linux/capability.h @@ -412,7 +412,6 @@ extern const kernel_cap_t __cap_init_eff_set; # define CAP_EMPTY_SET ((kernel_cap_t){{ 0, 0 }}) # define CAP_FULL_SET ((kernel_cap_t){{ ~0, ~0 }}) -# define CAP_INIT_EFF_SET ((kernel_cap_t){{ ~CAP_TO_MASK(CAP_SETPCAP), ~0 }}) # define CAP_FS_SET ((kernel_cap_t){{ CAP_FS_MASK_B0 \ | CAP_TO_MASK(CAP_LINUX_IMMUTABLE), \ CAP_FS_MASK_B1 } }) @@ -423,10 +422,10 @@ extern const kernel_cap_t __cap_init_eff_set; #endif /* _KERNEL_CAPABILITY_U32S != 2 */ #define CAP_INIT_INH_SET CAP_EMPTY_SET +#define CAP_INIT_EFF_SET CAP_FULL_SET # define cap_clear(c) do { (c) = __cap_empty_set; } while (0) # define cap_set_full(c) do { (c) = __cap_full_set; } while (0) -# define cap_set_init_eff(c) do { (c) = __cap_init_eff_set; } while (0) #define cap_raise(c, flag) ((c).cap[CAP_TO_INDEX(flag)] |= CAP_TO_MASK(flag)) #define cap_lower(c, flag) ((c).cap[CAP_TO_INDEX(flag)] &= ~CAP_TO_MASK(flag)) @@ -547,6 +546,9 @@ extern bool capable(int cap); extern bool ns_capable(struct user_namespace *ns, int cap); extern bool task_ns_capable(struct task_struct *t, int cap); +extern const kernel_cap_t __cap_empty_set; +extern const kernel_cap_t __cap_full_set; + /** * nsown_capable - Check superior capability to one's own user_ns * @cap: The capability in question diff --git a/kernel/capability.c b/kernel/capability.c index bf0c734d0c1..2a374d512ea 100644 --- a/kernel/capability.c +++ b/kernel/capability.c @@ -23,11 +23,9 @@ const kernel_cap_t __cap_empty_set = CAP_EMPTY_SET; const kernel_cap_t __cap_full_set = CAP_FULL_SET; -const kernel_cap_t __cap_init_eff_set = CAP_INIT_EFF_SET; EXPORT_SYMBOL(__cap_empty_set); EXPORT_SYMBOL(__cap_full_set); -EXPORT_SYMBOL(__cap_init_eff_set); int file_caps_enabled = 1; -- cgit v1.2.3-70-g09d2 From 5163b583a036b103c3cec7171d6731c125773ed6 Mon Sep 17 00:00:00 2001 From: Eric Paris Date: Fri, 1 Apr 2011 17:08:39 -0400 Subject: capabilities: delete unused cap_set_full unused code. Clean it up. Signed-off-by: Eric Paris Acked-by: David Howells Acked-by: Andrew G. 
Morgan Signed-off-by: James Morris --- include/linux/capability.h | 2 -- kernel/capability.c | 2 -- 2 files changed, 4 deletions(-) (limited to 'kernel') diff --git a/include/linux/capability.h b/include/linux/capability.h index 11d562863e4..8d0da30dad2 100644 --- a/include/linux/capability.h +++ b/include/linux/capability.h @@ -425,7 +425,6 @@ extern const kernel_cap_t __cap_init_eff_set; #define CAP_INIT_EFF_SET CAP_FULL_SET # define cap_clear(c) do { (c) = __cap_empty_set; } while (0) -# define cap_set_full(c) do { (c) = __cap_full_set; } while (0) #define cap_raise(c, flag) ((c).cap[CAP_TO_INDEX(flag)] |= CAP_TO_MASK(flag)) #define cap_lower(c, flag) ((c).cap[CAP_TO_INDEX(flag)] &= ~CAP_TO_MASK(flag)) @@ -547,7 +546,6 @@ extern bool ns_capable(struct user_namespace *ns, int cap); extern bool task_ns_capable(struct task_struct *t, int cap); extern const kernel_cap_t __cap_empty_set; -extern const kernel_cap_t __cap_full_set; /** * nsown_capable - Check superior capability to one's own user_ns diff --git a/kernel/capability.c b/kernel/capability.c index 2a374d512ea..14ea4210a53 100644 --- a/kernel/capability.c +++ b/kernel/capability.c @@ -22,10 +22,8 @@ */ const kernel_cap_t __cap_empty_set = CAP_EMPTY_SET; -const kernel_cap_t __cap_full_set = CAP_FULL_SET; EXPORT_SYMBOL(__cap_empty_set); -EXPORT_SYMBOL(__cap_full_set); int file_caps_enabled = 1; -- cgit v1.2.3-70-g09d2 From a3232d2fa2e3cbab3e76d91cdae5890fee8a4034 Mon Sep 17 00:00:00 2001 From: Eric Paris Date: Fri, 1 Apr 2011 17:08:45 -0400 Subject: capabilities: delete all CAP_INIT macros The CAP_INIT macros of INH, BSET, and EFF made sense at one point in time, but nowadays they aren't helping. Just open code the logic in the init_cred. Signed-off-by: Eric Paris Acked-by: David Howells Signed-off-by: James Morris --- include/linux/capability.h | 3 --- include/linux/init_task.h | 7 ------- kernel/cred.c | 6 +++--- 3 files changed, 3 insertions(+), 13 deletions(-) (limited to 'kernel') diff --git a/include/linux/capability.h b/include/linux/capability.h index 8d0da30dad2..04fed72809d 100644 --- a/include/linux/capability.h +++ b/include/linux/capability.h @@ -421,9 +421,6 @@ extern const kernel_cap_t __cap_init_eff_set; #endif /* _KERNEL_CAPABILITY_U32S != 2 */ -#define CAP_INIT_INH_SET CAP_EMPTY_SET -#define CAP_INIT_EFF_SET CAP_FULL_SET - # define cap_clear(c) do { (c) = __cap_empty_set; } while (0) #define cap_raise(c, flag) ((c).cap[CAP_TO_INDEX(flag)] |= CAP_TO_MASK(flag)) #define cap_lower(c, flag) ((c).cap[CAP_TO_INDEX(flag)] &= ~CAP_TO_MASK(flag)) diff --git a/include/linux/init_task.h b/include/linux/init_task.h index caa151fbebb..1f277204de3 100644 --- a/include/linux/init_task.h +++ b/include/linux/init_task.h @@ -83,13 +83,6 @@ extern struct group_info init_groups; #define INIT_IDS #endif -/* - * Because of the reduced scope of CAP_SETPCAP when filesystem - * capabilities are in effect, it is safe to allow CAP_SETPCAP to - * be available in the default configuration.
- */ -# define CAP_INIT_BSET CAP_FULL_SET - #ifdef CONFIG_RCU_BOOST #define INIT_TASK_RCU_BOOST() \ .rcu_boost_mutex = NULL, diff --git a/kernel/cred.c b/kernel/cred.c index 5557b55048d..b982f0863ae 100644 --- a/kernel/cred.c +++ b/kernel/cred.c @@ -49,10 +49,10 @@ struct cred init_cred = { .magic = CRED_MAGIC, #endif .securebits = SECUREBITS_DEFAULT, - .cap_inheritable = CAP_INIT_INH_SET, + .cap_inheritable = CAP_EMPTY_SET, .cap_permitted = CAP_FULL_SET, - .cap_effective = CAP_INIT_EFF_SET, - .cap_bset = CAP_INIT_BSET, + .cap_effective = CAP_FULL_SET, + .cap_bset = CAP_FULL_SET, .user = INIT_USER, .group_info = &init_groups, #ifdef CONFIG_KEYS -- cgit v1.2.3-70-g09d2 From 0663c6f8fa37d777ede74ff991a0cba3a42fcbd7 Mon Sep 17 00:00:00 2001 From: "Eric W. Biederman" Date: Sun, 7 Mar 2010 17:48:52 -0800 Subject: ns: Introduce the setns syscall With the networking stack today there is demand to handle multiple network stacks at a time. Not in the context of containers but in the context of people doing interesting things with routing. There is also demand in the context of containers to have an efficient way to execute some code in the container itself. If nothing else it is very useful as a debugging technique. Both problems can be solved by starting some form of login daemon in the namespaces people want access to, or you can play games by ptracing a process and getting the traced process to do things you want it to do. However it turns out that a login daemon or a ptrace puppet controller are more code, they are more prone to failure, and generally they are less efficient than simply changing the namespace of a process to a specified one. Pieces of this puzzle could also be solved, instead of with a general purpose system call, with targeted system calls, perhaps socketat, that each solve a subset of the larger problem. Overall that appears to be more work for less reward. int setns(int fd, int nstype); The fd argument is a file descriptor referring to a proc file of the namespace you want to switch the process to. In the setns system call, nstype is either 0 or the clone flag of the namespace you intend to change, to prevent changing a namespace unintentionally. v2: Most of the architecture support added by Daniel Lezcano v3: ported to v2.6.36-rc4 by: Eric W. Biederman v4: Moved wiring up of the system call to another patch v5: Cleaned up the system call arguments - Changed the order. - Modified nstype to take the standard clone flags. v6: Added missing error handling as pointed out by Matt Helsley Acked-by: Daniel Lezcano Signed-off-by: Eric W. Biederman
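[Editorial illustration, not part of the patch: once the syscall is wired up, a process with CAP_SYS_ADMIN can join another process's namespace by opening one of its /proc/<pid>/ns/ files and handing the descriptor to setns(). The sketch below assumes __NR_setns has been assigned for the target architecture and that no libc wrapper exists yet, so it invokes the syscall directly:]

#include <fcntl.h>
#include <sched.h>		/* CLONE_NEWUTS */
#include <stdio.h>
#include <unistd.h>
#include <sys/syscall.h>

int main(int argc, char **argv)
{
	int fd;

	/* argv[1] names a namespace file, e.g. /proc/1234/ns/uts */
	if (argc < 2)
		return 1;
	fd = open(argv[1], O_RDONLY);
	if (fd < 0) {
		perror("open");
		return 1;
	}
	/* nstype == CLONE_NEWUTS guards against joining the wrong kind
	 * of namespace; passing 0 would skip that check entirely. */
	if (syscall(__NR_setns, fd, CLONE_NEWUTS) < 0) {
		perror("setns");
		return 1;
	}
	close(fd);
	execlp("hostname", "hostname", (char *)NULL);	/* runs in the joined UTS ns */
	perror("execlp");
	return 1;
}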
--- kernel/nsproxy.c | 42 ++++++++++++++++++++++++++++++++++++++++++ 1 file changed, 42 insertions(+) (limited to 'kernel') diff --git a/kernel/nsproxy.c b/kernel/nsproxy.c index a05d191ffdd..5424e37673e 100644 --- a/kernel/nsproxy.c +++ b/kernel/nsproxy.c @@ -22,6 +22,9 @@ #include #include #include +#include +#include +#include static struct kmem_cache *nsproxy_cachep; @@ -233,6 +236,45 @@ void exit_task_namespaces(struct task_struct *p) switch_task_namespaces(p, NULL); } +SYSCALL_DEFINE2(setns, int, fd, int, nstype) +{ + const struct proc_ns_operations *ops; + struct task_struct *tsk = current; + struct nsproxy *new_nsproxy; + struct proc_inode *ei; + struct file *file; + int err; + + if (!capable(CAP_SYS_ADMIN)) + return -EPERM; + + file = proc_ns_fget(fd); + if (IS_ERR(file)) + return PTR_ERR(file); + + err = -EINVAL; + ei = PROC_I(file->f_dentry->d_inode); + ops = ei->ns_ops; + if (nstype && (ops->type != nstype)) + goto out; + + new_nsproxy = create_new_namespaces(0, tsk, tsk->fs); + if (IS_ERR(new_nsproxy)) { + err = PTR_ERR(new_nsproxy); + goto out; + } + + err = ops->install(new_nsproxy, ei->ns); + if (err) { + free_nsproxy(new_nsproxy); + goto out; + } + switch_task_namespaces(tsk, new_nsproxy); +out: + fput(file); + return err; +} + static int __init nsproxy_cache_init(void) { nsproxy_cachep = KMEM_CACHE(nsproxy, SLAB_PANIC); -- cgit v1.2.3-70-g09d2 From 34482e89a5218f0f9317abf1cfba3bb38b5c29dd Mon Sep 17 00:00:00 2001 From: "Eric W. Biederman" Date: Sun, 7 Mar 2010 18:43:27 -0800 Subject: ns proc: Add support for the uts namespace Acked-by: Daniel Lezcano Signed-off-by: Eric W. Biederman --- fs/proc/namespaces.c | 3 +++ include/linux/proc_fs.h | 1 + kernel/utsname.c | 39 +++++++++++++++++++++++++++++++++++++++ 3 files changed, 43 insertions(+) (limited to 'kernel') diff --git a/fs/proc/namespaces.c b/fs/proc/namespaces.c index dcbd483e991..b017181f127 100644 --- a/fs/proc/namespaces.c +++ b/fs/proc/namespaces.c @@ -19,6 +19,9 @@ static const struct proc_ns_operations *ns_entries[] = { #ifdef CONFIG_NET_NS &netns_operations, #endif +#ifdef CONFIG_UTS_NS + &utsns_operations, +#endif }; static const struct file_operations ns_file_operations = { diff --git a/include/linux/proc_fs.h b/include/linux/proc_fs.h index 62126ec6ede..52aa89df8a6 100644 --- a/include/linux/proc_fs.h +++ b/include/linux/proc_fs.h @@ -266,6 +266,7 @@ struct proc_ns_operations { int (*install)(struct nsproxy *nsproxy, void *ns); }; extern const struct proc_ns_operations netns_operations; +extern const struct proc_ns_operations utsns_operations; union proc_op { int (*proc_get_link)(struct inode *, struct path *); diff --git a/kernel/utsname.c b/kernel/utsname.c index 44646179eab..bff131b9510 100644 --- a/kernel/utsname.c +++ b/kernel/utsname.c @@ -15,6 +15,7 @@ #include #include #include +#include static struct uts_namespace *create_uts_ns(void) { @@ -79,3 +80,41 @@ void free_uts_ns(struct kref *kref) put_user_ns(ns->user_ns); kfree(ns); } + +static void *utsns_get(struct task_struct *task) +{ + struct uts_namespace *ns = NULL; + struct nsproxy *nsproxy; + + rcu_read_lock(); + nsproxy = task_nsproxy(task); + if (nsproxy) { + ns = nsproxy->uts_ns; + get_uts_ns(ns); + } + rcu_read_unlock(); + + return ns; +} + +static void utsns_put(void *ns) +{ + put_uts_ns(ns); +} + +static int utsns_install(struct nsproxy *nsproxy, void *ns) +{ + get_uts_ns(ns); + put_uts_ns(nsproxy->uts_ns); + nsproxy->uts_ns = ns; + return 0; +} + +const struct proc_ns_operations utsns_operations = { + .name = "uts", + .type = 
CLONE_NEWUTS, + .get = utsns_get, + .put = utsns_put, + .install = utsns_install, +}; + -- cgit v1.2.3-70-g09d2 From be84cb43833ee40a42e08f5425d20310f16229c7 Mon Sep 17 00:00:00 2001 From: Chris Metcalf Date: Mon, 9 May 2011 13:12:30 -0400 Subject: compat: fixes to allow working with tile arch The existing mechanism doesn't really provide enough to create the 64-bit "compat" ABI properly in a generic way, since the compat ABI is a mix of things where you can re-use the 64-bit versions of syscalls and things where you need a compat wrapper. To provide this in the most direct way possible, I added two new macros to go along with the existing __SYSCALL and __SC_3264 macros: __SC_COMP and __SC_COMP_3264. These macros take an additional argument, typically a "compat_sys_xxx" function, which is passed to __SYSCALL if you define __SYSCALL_COMPAT when including the header, resulting in a pointer to the compat function being placed in the generated syscall table. The change also adds some missing definitions to <linux/compat.h> so that it actually has declarations for all the compat syscalls, since the "[nr] = ##call" approach requires proper C declarations for all the functions included in the syscall table. Finally, compat.c defines compat_sys_sigpending() and compat_sys_sigprocmask() even if the underlying architecture doesn't request it, which tries to pull in undefined compat_old_sigset_t defines. We need to guard those compat syscall definitions with appropriate __ARCH_WANT_SYS_xxx ifdefs. Acked-by: Arnd Bergmann Signed-off-by: Chris Metcalf --- arch/tile/kernel/compat.c | 2 +- include/asm-generic/unistd.h | 221 ++++++++++++++++++++++++------------- include/linux/compat.h | 187 ++++++++++++++++++++++++++++++++++++ kernel/compat.c | 8 ++ 4 files changed, 320 insertions(+), 98 deletions(-) (limited to 'kernel') diff --git a/arch/tile/kernel/compat.c b/arch/tile/kernel/compat.c index dbc213adf5e..e35a3975ca3 100644 --- a/arch/tile/kernel/compat.c +++ b/arch/tile/kernel/compat.c @@ -135,7 +135,7 @@ long tile_compat_sys_msgrcv(int msqid, /* Provide the compat syscall number to call mapping. */ #undef __SYSCALL -#define __SYSCALL(nr, call) [nr] = (compat_##call), +#define __SYSCALL(nr, call) [nr] = (call), /* The generic versions of these don't work for Tile.
*/ #define compat_sys_msgrcv tile_compat_sys_msgrcv diff --git a/include/asm-generic/unistd.h b/include/asm-generic/unistd.h index 07c40d5149d..33d52470488 100644 --- a/include/asm-generic/unistd.h +++ b/include/asm-generic/unistd.h @@ -24,16 +24,24 @@ #define __SC_3264(_nr, _32, _64) __SYSCALL(_nr, _64) #endif +#ifdef __SYSCALL_COMPAT +#define __SC_COMP(_nr, _sys, _comp) __SYSCALL(_nr, _comp) +#define __SC_COMP_3264(_nr, _32, _64, _comp) __SYSCALL(_nr, _comp) +#else +#define __SC_COMP(_nr, _sys, _comp) __SYSCALL(_nr, _sys) +#define __SC_COMP_3264(_nr, _32, _64, _comp) __SC_3264(_nr, _32, _64) +#endif + #define __NR_io_setup 0 -__SYSCALL(__NR_io_setup, sys_io_setup) +__SC_COMP(__NR_io_setup, sys_io_setup, compat_sys_io_setup) #define __NR_io_destroy 1 __SYSCALL(__NR_io_destroy, sys_io_destroy) #define __NR_io_submit 2 -__SYSCALL(__NR_io_submit, sys_io_submit) +__SC_COMP(__NR_io_submit, sys_io_submit, compat_sys_io_submit) #define __NR_io_cancel 3 __SYSCALL(__NR_io_cancel, sys_io_cancel) #define __NR_io_getevents 4 -__SYSCALL(__NR_io_getevents, sys_io_getevents) +__SC_COMP(__NR_io_getevents, sys_io_getevents, compat_sys_io_getevents) /* fs/xattr.c */ #define __NR_setxattr 5 @@ -67,7 +75,7 @@ __SYSCALL(__NR_getcwd, sys_getcwd) /* fs/cookies.c */ #define __NR_lookup_dcookie 18 -__SYSCALL(__NR_lookup_dcookie, sys_lookup_dcookie) +__SC_COMP(__NR_lookup_dcookie, sys_lookup_dcookie, compat_sys_lookup_dcookie) /* fs/eventfd.c */ #define __NR_eventfd2 19 @@ -79,7 +87,7 @@ __SYSCALL(__NR_epoll_create1, sys_epoll_create1) #define __NR_epoll_ctl 21 __SYSCALL(__NR_epoll_ctl, sys_epoll_ctl) #define __NR_epoll_pwait 22 -__SYSCALL(__NR_epoll_pwait, sys_epoll_pwait) +__SC_COMP(__NR_epoll_pwait, sys_epoll_pwait, compat_sys_epoll_pwait) /* fs/fcntl.c */ #define __NR_dup 23 @@ -87,7 +95,7 @@ __SYSCALL(__NR_dup, sys_dup) #define __NR_dup3 24 __SYSCALL(__NR_dup3, sys_dup3) #define __NR3264_fcntl 25 -__SC_3264(__NR3264_fcntl, sys_fcntl64, sys_fcntl) +__SC_COMP_3264(__NR3264_fcntl, sys_fcntl64, sys_fcntl, compat_sys_fcntl64) /* fs/inotify_user.c */ #define __NR_inotify_init1 26 @@ -99,7 +107,7 @@ __SYSCALL(__NR_inotify_rm_watch, sys_inotify_rm_watch) /* fs/ioctl.c */ #define __NR_ioctl 29 -__SYSCALL(__NR_ioctl, sys_ioctl) +__SC_COMP(__NR_ioctl, sys_ioctl, compat_sys_ioctl) /* fs/ioprio.c */ #define __NR_ioprio_set 30 @@ -129,26 +137,30 @@ __SYSCALL(__NR_renameat, sys_renameat) #define __NR_umount2 39 __SYSCALL(__NR_umount2, sys_umount) #define __NR_mount 40 -__SYSCALL(__NR_mount, sys_mount) +__SC_COMP(__NR_mount, sys_mount, compat_sys_mount) #define __NR_pivot_root 41 __SYSCALL(__NR_pivot_root, sys_pivot_root) /* fs/nfsctl.c */ #define __NR_nfsservctl 42 -__SYSCALL(__NR_nfsservctl, sys_nfsservctl) +__SC_COMP(__NR_nfsservctl, sys_nfsservctl, compat_sys_nfsservctl) /* fs/open.c */ #define __NR3264_statfs 43 -__SC_3264(__NR3264_statfs, sys_statfs64, sys_statfs) +__SC_COMP_3264(__NR3264_statfs, sys_statfs64, sys_statfs, \ + compat_sys_statfs64) #define __NR3264_fstatfs 44 -__SC_3264(__NR3264_fstatfs, sys_fstatfs64, sys_fstatfs) +__SC_COMP_3264(__NR3264_fstatfs, sys_fstatfs64, sys_fstatfs, \ + compat_sys_fstatfs64) #define __NR3264_truncate 45 -__SC_3264(__NR3264_truncate, sys_truncate64, sys_truncate) +__SC_COMP_3264(__NR3264_truncate, sys_truncate64, sys_truncate, \ + compat_sys_truncate64) #define __NR3264_ftruncate 46 -__SC_3264(__NR3264_ftruncate, sys_ftruncate64, sys_ftruncate) +__SC_COMP_3264(__NR3264_ftruncate, sys_ftruncate64, sys_ftruncate, \ + compat_sys_ftruncate64) #define __NR_fallocate 47 
-__SYSCALL(__NR_fallocate, sys_fallocate) +__SC_COMP(__NR_fallocate, sys_fallocate, compat_sys_fallocate) #define __NR_faccessat 48 __SYSCALL(__NR_faccessat, sys_faccessat) #define __NR_chdir 49 @@ -166,7 +178,7 @@ __SYSCALL(__NR_fchownat, sys_fchownat) #define __NR_fchown 55 __SYSCALL(__NR_fchown, sys_fchown) #define __NR_openat 56 -__SYSCALL(__NR_openat, sys_openat) +__SC_COMP(__NR_openat, sys_openat, compat_sys_openat) #define __NR_close 57 __SYSCALL(__NR_close, sys_close) #define __NR_vhangup 58 @@ -182,7 +194,7 @@ __SYSCALL(__NR_quotactl, sys_quotactl) /* fs/readdir.c */ #define __NR_getdents64 61 -__SYSCALL(__NR_getdents64, sys_getdents64) +__SC_COMP(__NR_getdents64, sys_getdents64, compat_sys_getdents64) /* fs/read_write.c */ #define __NR3264_lseek 62 @@ -192,17 +204,17 @@ __SYSCALL(__NR_read, sys_read) #define __NR_write 64 __SYSCALL(__NR_write, sys_write) #define __NR_readv 65 -__SYSCALL(__NR_readv, sys_readv) +__SC_COMP(__NR_readv, sys_readv, compat_sys_readv) #define __NR_writev 66 -__SYSCALL(__NR_writev, sys_writev) +__SC_COMP(__NR_writev, sys_writev, compat_sys_writev) #define __NR_pread64 67 -__SYSCALL(__NR_pread64, sys_pread64) +__SC_COMP(__NR_pread64, sys_pread64, compat_sys_pread64) #define __NR_pwrite64 68 -__SYSCALL(__NR_pwrite64, sys_pwrite64) +__SC_COMP(__NR_pwrite64, sys_pwrite64, compat_sys_pwrite64) #define __NR_preadv 69 -__SYSCALL(__NR_preadv, sys_preadv) +__SC_COMP(__NR_preadv, sys_preadv, compat_sys_preadv) #define __NR_pwritev 70 -__SYSCALL(__NR_pwritev, sys_pwritev) +__SC_COMP(__NR_pwritev, sys_pwritev, compat_sys_pwritev) /* fs/sendfile.c */ #define __NR3264_sendfile 71 @@ -210,17 +222,17 @@ __SC_3264(__NR3264_sendfile, sys_sendfile64, sys_sendfile) /* fs/select.c */ #define __NR_pselect6 72 -__SYSCALL(__NR_pselect6, sys_pselect6) +__SC_COMP(__NR_pselect6, sys_pselect6, compat_sys_pselect6) #define __NR_ppoll 73 -__SYSCALL(__NR_ppoll, sys_ppoll) +__SC_COMP(__NR_ppoll, sys_ppoll, compat_sys_ppoll) /* fs/signalfd.c */ #define __NR_signalfd4 74 -__SYSCALL(__NR_signalfd4, sys_signalfd4) +__SC_COMP(__NR_signalfd4, sys_signalfd4, compat_sys_signalfd4) /* fs/splice.c */ #define __NR_vmsplice 75 -__SYSCALL(__NR_vmsplice, sys_vmsplice) +__SC_COMP(__NR_vmsplice, sys_vmsplice, compat_sys_vmsplice) #define __NR_splice 76 __SYSCALL(__NR_splice, sys_splice) #define __NR_tee 77 @@ -243,23 +255,27 @@ __SYSCALL(__NR_fsync, sys_fsync) __SYSCALL(__NR_fdatasync, sys_fdatasync) #ifdef __ARCH_WANT_SYNC_FILE_RANGE2 #define __NR_sync_file_range2 84 -__SYSCALL(__NR_sync_file_range2, sys_sync_file_range2) +__SC_COMP(__NR_sync_file_range2, sys_sync_file_range2, \ + compat_sys_sync_file_range2) #else #define __NR_sync_file_range 84 -__SYSCALL(__NR_sync_file_range, sys_sync_file_range) +__SC_COMP(__NR_sync_file_range, sys_sync_file_range, \ + compat_sys_sync_file_range) #endif /* fs/timerfd.c */ #define __NR_timerfd_create 85 __SYSCALL(__NR_timerfd_create, sys_timerfd_create) #define __NR_timerfd_settime 86 -__SYSCALL(__NR_timerfd_settime, sys_timerfd_settime) +__SC_COMP(__NR_timerfd_settime, sys_timerfd_settime, \ + compat_sys_timerfd_settime) #define __NR_timerfd_gettime 87 -__SYSCALL(__NR_timerfd_gettime, sys_timerfd_gettime) +__SC_COMP(__NR_timerfd_gettime, sys_timerfd_gettime, \ + compat_sys_timerfd_gettime) /* fs/utimes.c */ #define __NR_utimensat 88 -__SYSCALL(__NR_utimensat, sys_utimensat) +__SC_COMP(__NR_utimensat, sys_utimensat, compat_sys_utimensat) /* kernel/acct.c */ #define __NR_acct 89 @@ -281,7 +297,7 @@ __SYSCALL(__NR_exit, sys_exit) #define __NR_exit_group 94 
__SYSCALL(__NR_exit_group, sys_exit_group) #define __NR_waitid 95 -__SYSCALL(__NR_waitid, sys_waitid) +__SC_COMP(__NR_waitid, sys_waitid, compat_sys_waitid) /* kernel/fork.c */ #define __NR_set_tid_address 96 @@ -291,25 +307,27 @@ __SYSCALL(__NR_unshare, sys_unshare) /* kernel/futex.c */ #define __NR_futex 98 -__SYSCALL(__NR_futex, sys_futex) +__SC_COMP(__NR_futex, sys_futex, compat_sys_futex) #define __NR_set_robust_list 99 -__SYSCALL(__NR_set_robust_list, sys_set_robust_list) +__SC_COMP(__NR_set_robust_list, sys_set_robust_list, \ + compat_sys_set_robust_list) #define __NR_get_robust_list 100 -__SYSCALL(__NR_get_robust_list, sys_get_robust_list) +__SC_COMP(__NR_get_robust_list, sys_get_robust_list, \ + compat_sys_get_robust_list) /* kernel/hrtimer.c */ #define __NR_nanosleep 101 -__SYSCALL(__NR_nanosleep, sys_nanosleep) +__SC_COMP(__NR_nanosleep, sys_nanosleep, compat_sys_nanosleep) /* kernel/itimer.c */ #define __NR_getitimer 102 -__SYSCALL(__NR_getitimer, sys_getitimer) +__SC_COMP(__NR_getitimer, sys_getitimer, compat_sys_getitimer) #define __NR_setitimer 103 -__SYSCALL(__NR_setitimer, sys_setitimer) +__SC_COMP(__NR_setitimer, sys_setitimer, compat_sys_setitimer) /* kernel/kexec.c */ #define __NR_kexec_load 104 -__SYSCALL(__NR_kexec_load, sys_kexec_load) +__SC_COMP(__NR_kexec_load, sys_kexec_load, compat_sys_kexec_load) /* kernel/module.c */ #define __NR_init_module 105 @@ -319,23 +337,24 @@ __SYSCALL(__NR_delete_module, sys_delete_module) /* kernel/posix-timers.c */ #define __NR_timer_create 107 -__SYSCALL(__NR_timer_create, sys_timer_create) +__SC_COMP(__NR_timer_create, sys_timer_create, compat_sys_timer_create) #define __NR_timer_gettime 108 -__SYSCALL(__NR_timer_gettime, sys_timer_gettime) +__SC_COMP(__NR_timer_gettime, sys_timer_gettime, compat_sys_timer_gettime) #define __NR_timer_getoverrun 109 __SYSCALL(__NR_timer_getoverrun, sys_timer_getoverrun) #define __NR_timer_settime 110 -__SYSCALL(__NR_timer_settime, sys_timer_settime) +__SC_COMP(__NR_timer_settime, sys_timer_settime, compat_sys_timer_settime) #define __NR_timer_delete 111 __SYSCALL(__NR_timer_delete, sys_timer_delete) #define __NR_clock_settime 112 -__SYSCALL(__NR_clock_settime, sys_clock_settime) +__SC_COMP(__NR_clock_settime, sys_clock_settime, compat_sys_clock_settime) #define __NR_clock_gettime 113 -__SYSCALL(__NR_clock_gettime, sys_clock_gettime) +__SC_COMP(__NR_clock_gettime, sys_clock_gettime, compat_sys_clock_gettime) #define __NR_clock_getres 114 -__SYSCALL(__NR_clock_getres, sys_clock_getres) +__SC_COMP(__NR_clock_getres, sys_clock_getres, compat_sys_clock_getres) #define __NR_clock_nanosleep 115 -__SYSCALL(__NR_clock_nanosleep, sys_clock_nanosleep) +__SC_COMP(__NR_clock_nanosleep, sys_clock_nanosleep, \ + compat_sys_clock_nanosleep) /* kernel/printk.c */ #define __NR_syslog 116 @@ -355,9 +374,11 @@ __SYSCALL(__NR_sched_getscheduler, sys_sched_getscheduler) #define __NR_sched_getparam 121 __SYSCALL(__NR_sched_getparam, sys_sched_getparam) #define __NR_sched_setaffinity 122 -__SYSCALL(__NR_sched_setaffinity, sys_sched_setaffinity) +__SC_COMP(__NR_sched_setaffinity, sys_sched_setaffinity, \ + compat_sys_sched_setaffinity) #define __NR_sched_getaffinity 123 -__SYSCALL(__NR_sched_getaffinity, sys_sched_getaffinity) +__SC_COMP(__NR_sched_getaffinity, sys_sched_getaffinity, \ + compat_sys_sched_getaffinity) #define __NR_sched_yield 124 __SYSCALL(__NR_sched_yield, sys_sched_yield) #define __NR_sched_get_priority_max 125 @@ -365,7 +386,8 @@ __SYSCALL(__NR_sched_get_priority_max, sys_sched_get_priority_max) #define 
__NR_sched_get_priority_min 126 __SYSCALL(__NR_sched_get_priority_min, sys_sched_get_priority_min) #define __NR_sched_rr_get_interval 127 -__SYSCALL(__NR_sched_rr_get_interval, sys_sched_rr_get_interval) +__SC_COMP(__NR_sched_rr_get_interval, sys_sched_rr_get_interval, \ + compat_sys_sched_rr_get_interval) /* kernel/signal.c */ #define __NR_restart_syscall 128 @@ -377,21 +399,23 @@ __SYSCALL(__NR_tkill, sys_tkill) #define __NR_tgkill 131 __SYSCALL(__NR_tgkill, sys_tgkill) #define __NR_sigaltstack 132 -__SYSCALL(__NR_sigaltstack, sys_sigaltstack) +__SC_COMP(__NR_sigaltstack, sys_sigaltstack, compat_sys_sigaltstack) #define __NR_rt_sigsuspend 133 -__SYSCALL(__NR_rt_sigsuspend, sys_rt_sigsuspend) /* __ARCH_WANT_SYS_RT_SIGSUSPEND */ +__SC_COMP(__NR_rt_sigsuspend, sys_rt_sigsuspend, compat_sys_rt_sigsuspend) #define __NR_rt_sigaction 134 -__SYSCALL(__NR_rt_sigaction, sys_rt_sigaction) /* __ARCH_WANT_SYS_RT_SIGACTION */ +__SC_COMP(__NR_rt_sigaction, sys_rt_sigaction, compat_sys_rt_sigaction) #define __NR_rt_sigprocmask 135 __SYSCALL(__NR_rt_sigprocmask, sys_rt_sigprocmask) #define __NR_rt_sigpending 136 __SYSCALL(__NR_rt_sigpending, sys_rt_sigpending) #define __NR_rt_sigtimedwait 137 -__SYSCALL(__NR_rt_sigtimedwait, sys_rt_sigtimedwait) +__SC_COMP(__NR_rt_sigtimedwait, sys_rt_sigtimedwait, \ + compat_sys_rt_sigtimedwait) #define __NR_rt_sigqueueinfo 138 -__SYSCALL(__NR_rt_sigqueueinfo, sys_rt_sigqueueinfo) +__SC_COMP(__NR_rt_sigqueueinfo, sys_rt_sigqueueinfo, \ + compat_sys_rt_sigqueueinfo) #define __NR_rt_sigreturn 139 -__SYSCALL(__NR_rt_sigreturn, sys_rt_sigreturn) /* sys_rt_sigreturn_wrapper, */ +__SC_COMP(__NR_rt_sigreturn, sys_rt_sigreturn, compat_sys_rt_sigreturn) /* kernel/sys.c */ #define __NR_setpriority 140 @@ -421,7 +445,7 @@ __SYSCALL(__NR_setfsuid, sys_setfsuid) #define __NR_setfsgid 152 __SYSCALL(__NR_setfsgid, sys_setfsgid) #define __NR_times 153 -__SYSCALL(__NR_times, sys_times) +__SC_COMP(__NR_times, sys_times, compat_sys_times) #define __NR_setpgid 154 __SYSCALL(__NR_setpgid, sys_setpgid) #define __NR_getpgid 155 @@ -441,11 +465,11 @@ __SYSCALL(__NR_sethostname, sys_sethostname) #define __NR_setdomainname 162 __SYSCALL(__NR_setdomainname, sys_setdomainname) #define __NR_getrlimit 163 -__SYSCALL(__NR_getrlimit, sys_getrlimit) +__SC_COMP(__NR_getrlimit, sys_getrlimit, compat_sys_getrlimit) #define __NR_setrlimit 164 -__SYSCALL(__NR_setrlimit, sys_setrlimit) +__SC_COMP(__NR_setrlimit, sys_setrlimit, compat_sys_setrlimit) #define __NR_getrusage 165 -__SYSCALL(__NR_getrusage, sys_getrusage) +__SC_COMP(__NR_getrusage, sys_getrusage, compat_sys_getrusage) #define __NR_umask 166 __SYSCALL(__NR_umask, sys_umask) #define __NR_prctl 167 @@ -455,11 +479,11 @@ __SYSCALL(__NR_getcpu, sys_getcpu) /* kernel/time.c */ #define __NR_gettimeofday 169 -__SYSCALL(__NR_gettimeofday, sys_gettimeofday) +__SC_COMP(__NR_gettimeofday, sys_gettimeofday, compat_sys_gettimeofday) #define __NR_settimeofday 170 -__SYSCALL(__NR_settimeofday, sys_settimeofday) +__SC_COMP(__NR_settimeofday, sys_settimeofday, compat_sys_settimeofday) #define __NR_adjtimex 171 -__SYSCALL(__NR_adjtimex, sys_adjtimex) +__SC_COMP(__NR_adjtimex, sys_adjtimex, compat_sys_adjtimex) /* kernel/timer.c */ #define __NR_getpid 172 @@ -477,39 +501,40 @@ __SYSCALL(__NR_getegid, sys_getegid) #define __NR_gettid 178 __SYSCALL(__NR_gettid, sys_gettid) #define __NR_sysinfo 179 -__SYSCALL(__NR_sysinfo, sys_sysinfo) +__SC_COMP(__NR_sysinfo, sys_sysinfo, compat_sys_sysinfo) /* ipc/mqueue.c */ #define __NR_mq_open 180 -__SYSCALL(__NR_mq_open, 
sys_mq_open) +__SC_COMP(__NR_mq_open, sys_mq_open, compat_sys_mq_open) #define __NR_mq_unlink 181 __SYSCALL(__NR_mq_unlink, sys_mq_unlink) #define __NR_mq_timedsend 182 -__SYSCALL(__NR_mq_timedsend, sys_mq_timedsend) +__SC_COMP(__NR_mq_timedsend, sys_mq_timedsend, compat_sys_mq_timedsend) #define __NR_mq_timedreceive 183 -__SYSCALL(__NR_mq_timedreceive, sys_mq_timedreceive) +__SC_COMP(__NR_mq_timedreceive, sys_mq_timedreceive, \ + compat_sys_mq_timedreceive) #define __NR_mq_notify 184 -__SYSCALL(__NR_mq_notify, sys_mq_notify) +__SC_COMP(__NR_mq_notify, sys_mq_notify, compat_sys_mq_notify) #define __NR_mq_getsetattr 185 -__SYSCALL(__NR_mq_getsetattr, sys_mq_getsetattr) +__SC_COMP(__NR_mq_getsetattr, sys_mq_getsetattr, compat_sys_mq_getsetattr) /* ipc/msg.c */ #define __NR_msgget 186 __SYSCALL(__NR_msgget, sys_msgget) #define __NR_msgctl 187 -__SYSCALL(__NR_msgctl, sys_msgctl) +__SC_COMP(__NR_msgctl, sys_msgctl, compat_sys_msgctl) #define __NR_msgrcv 188 -__SYSCALL(__NR_msgrcv, sys_msgrcv) +__SC_COMP(__NR_msgrcv, sys_msgrcv, compat_sys_msgrcv) #define __NR_msgsnd 189 -__SYSCALL(__NR_msgsnd, sys_msgsnd) +__SC_COMP(__NR_msgsnd, sys_msgsnd, compat_sys_msgsnd) /* ipc/sem.c */ #define __NR_semget 190 __SYSCALL(__NR_semget, sys_semget) #define __NR_semctl 191 -__SYSCALL(__NR_semctl, sys_semctl) +__SC_COMP(__NR_semctl, sys_semctl, compat_sys_semctl) #define __NR_semtimedop 192 -__SYSCALL(__NR_semtimedop, sys_semtimedop) +__SC_COMP(__NR_semtimedop, sys_semtimedop, compat_sys_semtimedop) #define __NR_semop 193 __SYSCALL(__NR_semop, sys_semop) @@ -517,9 +542,9 @@ __SYSCALL(__NR_semop, sys_semop) #define __NR_shmget 194 __SYSCALL(__NR_shmget, sys_shmget) #define __NR_shmctl 195 -__SYSCALL(__NR_shmctl, sys_shmctl) +__SC_COMP(__NR_shmctl, sys_shmctl, compat_sys_shmctl) #define __NR_shmat 196 -__SYSCALL(__NR_shmat, sys_shmat) +__SC_COMP(__NR_shmat, sys_shmat, compat_sys_shmat) #define __NR_shmdt 197 __SYSCALL(__NR_shmdt, sys_shmdt) @@ -543,21 +568,21 @@ __SYSCALL(__NR_getpeername, sys_getpeername) #define __NR_sendto 206 __SYSCALL(__NR_sendto, sys_sendto) #define __NR_recvfrom 207 -__SYSCALL(__NR_recvfrom, sys_recvfrom) +__SC_COMP(__NR_recvfrom, sys_recvfrom, compat_sys_recvfrom) #define __NR_setsockopt 208 -__SYSCALL(__NR_setsockopt, sys_setsockopt) +__SC_COMP(__NR_setsockopt, sys_setsockopt, compat_sys_setsockopt) #define __NR_getsockopt 209 -__SYSCALL(__NR_getsockopt, sys_getsockopt) +__SC_COMP(__NR_getsockopt, sys_getsockopt, compat_sys_getsockopt) #define __NR_shutdown 210 __SYSCALL(__NR_shutdown, sys_shutdown) #define __NR_sendmsg 211 -__SYSCALL(__NR_sendmsg, sys_sendmsg) +__SC_COMP(__NR_sendmsg, sys_sendmsg, compat_sys_sendmsg) #define __NR_recvmsg 212 -__SYSCALL(__NR_recvmsg, sys_recvmsg) +__SC_COMP(__NR_recvmsg, sys_recvmsg, compat_sys_recvmsg) /* mm/filemap.c */ #define __NR_readahead 213 -__SYSCALL(__NR_readahead, sys_readahead) +__SC_COMP(__NR_readahead, sys_readahead, compat_sys_readahead) /* mm/nommu.c, also with MMU */ #define __NR_brk 214 @@ -573,19 +598,19 @@ __SYSCALL(__NR_add_key, sys_add_key) #define __NR_request_key 218 __SYSCALL(__NR_request_key, sys_request_key) #define __NR_keyctl 219 -__SYSCALL(__NR_keyctl, sys_keyctl) +__SC_COMP(__NR_keyctl, sys_keyctl, compat_sys_keyctl) /* arch/example/kernel/sys_example.c */ #define __NR_clone 220 -__SYSCALL(__NR_clone, sys_clone) /* .long sys_clone_wrapper */ +__SYSCALL(__NR_clone, sys_clone) #define __NR_execve 221 -__SYSCALL(__NR_execve, sys_execve) /* .long sys_execve_wrapper */ +__SC_COMP(__NR_execve, sys_execve, compat_sys_execve) 
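/*
 * Illustrative expansion (an editorial sketch, not part of the patch):
 * given the __SC_COMP definitions at the top of this header, the execve
 * entry above resolves in two ways.  A native table built with
 *     #define __SYSCALL(nr, call) [nr] = (call),
 * becomes "[221] = (sys_execve),", while the same entry compiled with
 * __SYSCALL_COMPAT defined becomes "[221] = (compat_sys_execve),".
 * This is how a single header generates both syscall tables.
 */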
#define __NR3264_mmap 222 __SC_3264(__NR3264_mmap, sys_mmap2, sys_mmap) /* mm/fadvise.c */ #define __NR3264_fadvise64 223 -__SYSCALL(__NR3264_fadvise64, sys_fadvise64_64) +__SC_COMP(__NR3264_fadvise64, sys_fadvise64_64, compat_sys_fadvise64_64) /* mm/, CONFIG_MMU only */ #ifndef __ARCH_NOMMU @@ -612,25 +637,26 @@ __SYSCALL(__NR_madvise, sys_madvise) #define __NR_remap_file_pages 234 __SYSCALL(__NR_remap_file_pages, sys_remap_file_pages) #define __NR_mbind 235 -__SYSCALL(__NR_mbind, sys_mbind) +__SC_COMP(__NR_mbind, sys_mbind, compat_sys_mbind) #define __NR_get_mempolicy 236 -__SYSCALL(__NR_get_mempolicy, sys_get_mempolicy) +__SC_COMP(__NR_get_mempolicy, sys_get_mempolicy, compat_sys_get_mempolicy) #define __NR_set_mempolicy 237 -__SYSCALL(__NR_set_mempolicy, sys_set_mempolicy) +__SC_COMP(__NR_set_mempolicy, sys_set_mempolicy, compat_sys_set_mempolicy) #define __NR_migrate_pages 238 -__SYSCALL(__NR_migrate_pages, sys_migrate_pages) +__SC_COMP(__NR_migrate_pages, sys_migrate_pages, compat_sys_migrate_pages) #define __NR_move_pages 239 -__SYSCALL(__NR_move_pages, sys_move_pages) +__SC_COMP(__NR_move_pages, sys_move_pages, compat_sys_move_pages) #endif #define __NR_rt_tgsigqueueinfo 240 -__SYSCALL(__NR_rt_tgsigqueueinfo, sys_rt_tgsigqueueinfo) +__SC_COMP(__NR_rt_tgsigqueueinfo, sys_rt_tgsigqueueinfo, \ + compat_sys_rt_tgsigqueueinfo) #define __NR_perf_event_open 241 __SYSCALL(__NR_perf_event_open, sys_perf_event_open) #define __NR_accept4 242 __SYSCALL(__NR_accept4, sys_accept4) #define __NR_recvmmsg 243 -__SYSCALL(__NR_recvmmsg, sys_recvmmsg) +__SC_COMP(__NR_recvmmsg, sys_recvmmsg, compat_sys_recvmmsg) /* * Architectures may provide up to 16 syscalls of their own @@ -639,19 +665,20 @@ __SYSCALL(__NR_recvmmsg, sys_recvmmsg) #define __NR_arch_specific_syscall 244 #define __NR_wait4 260 -__SYSCALL(__NR_wait4, sys_wait4) +__SC_COMP(__NR_wait4, sys_wait4, compat_sys_wait4) #define __NR_prlimit64 261 __SYSCALL(__NR_prlimit64, sys_prlimit64) #define __NR_fanotify_init 262 __SYSCALL(__NR_fanotify_init, sys_fanotify_init) #define __NR_fanotify_mark 263 __SYSCALL(__NR_fanotify_mark, sys_fanotify_mark) -#define __NR_name_to_handle_at 264 +#define __NR_name_to_handle_at 264 __SYSCALL(__NR_name_to_handle_at, sys_name_to_handle_at) -#define __NR_open_by_handle_at 265 -__SYSCALL(__NR_open_by_handle_at, sys_open_by_handle_at) +#define __NR_open_by_handle_at 265 +__SC_COMP(__NR_open_by_handle_at, sys_open_by_handle_at, \ + compat_sys_open_by_handle_at) #define __NR_clock_adjtime 266 -__SYSCALL(__NR_clock_adjtime, sys_clock_adjtime) +__SC_COMP(__NR_clock_adjtime, sys_clock_adjtime, compat_sys_clock_adjtime) #define __NR_syncfs 267 __SYSCALL(__NR_syncfs, sys_syncfs) diff --git a/include/linux/compat.h b/include/linux/compat.h index 5778b559d59..e94184834b7 100644 --- a/include/linux/compat.h +++ b/include/linux/compat.h @@ -12,6 +12,7 @@ #include #include #include +#include #include #include @@ -209,6 +210,18 @@ struct compat_robust_list_head { compat_uptr_t list_op_pending; }; +struct compat_statfs; +struct compat_statfs64; +struct compat_old_linux_dirent; +struct compat_linux_dirent; +struct linux_dirent64; +struct compat_msghdr; +struct compat_mmsghdr; +struct compat_sysinfo; +struct compat_sysctl_args; +struct compat_kexec_segment; +struct compat_mq_attr; + extern void compat_exit_robust_list(struct task_struct *curr); asmlinkage long @@ -331,9 +344,13 @@ asmlinkage long compat_sys_epoll_pwait(int epfd, const compat_sigset_t __user *sigmask, compat_size_t sigsetsize); +asmlinkage long compat_sys_utime(const 
char __user *filename, + struct compat_utimbuf __user *t); asmlinkage long compat_sys_utimensat(unsigned int dfd, const char __user *filename, struct compat_timespec __user *t, int flags); +asmlinkage long compat_sys_time(compat_time_t __user *tloc); +asmlinkage long compat_sys_stime(compat_time_t __user *tptr); asmlinkage long compat_sys_signalfd(int ufd, const compat_sigset_t __user *sigmask, compat_size_t sigsetsize); @@ -350,11 +367,181 @@ asmlinkage long compat_sys_move_pages(pid_t pid, unsigned long nr_page, int flags); asmlinkage long compat_sys_futimesat(unsigned int dfd, const char __user *filename, struct compat_timeval __user *t); +asmlinkage long compat_sys_utimes(const char __user *filename, + struct compat_timeval __user *t); +asmlinkage long compat_sys_newstat(const char __user * filename, + struct compat_stat __user *statbuf); +asmlinkage long compat_sys_newlstat(const char __user * filename, + struct compat_stat __user *statbuf); asmlinkage long compat_sys_newfstatat(unsigned int dfd, const char __user * filename, struct compat_stat __user *statbuf, int flag); +asmlinkage long compat_sys_newfstat(unsigned int fd, + struct compat_stat __user * statbuf); +asmlinkage long compat_sys_statfs(const char __user *pathname, + struct compat_statfs __user *buf); +asmlinkage long compat_sys_fstatfs(unsigned int fd, + struct compat_statfs __user *buf); +asmlinkage long compat_sys_statfs64(const char __user *pathname, + compat_size_t sz, + struct compat_statfs64 __user *buf); +asmlinkage long compat_sys_fstatfs64(unsigned int fd, compat_size_t sz, + struct compat_statfs64 __user *buf); +asmlinkage long compat_sys_fcntl64(unsigned int fd, unsigned int cmd, + unsigned long arg); +asmlinkage long compat_sys_fcntl(unsigned int fd, unsigned int cmd, + unsigned long arg); +asmlinkage long compat_sys_io_setup(unsigned nr_reqs, u32 __user *ctx32p); +asmlinkage long compat_sys_io_getevents(aio_context_t ctx_id, + unsigned long min_nr, + unsigned long nr, + struct io_event __user *events, + struct compat_timespec __user *timeout); +asmlinkage long compat_sys_io_submit(aio_context_t ctx_id, int nr, + u32 __user *iocb); +asmlinkage long compat_sys_mount(const char __user * dev_name, + const char __user * dir_name, + const char __user * type, unsigned long flags, + const void __user * data); +asmlinkage long compat_sys_old_readdir(unsigned int fd, + struct compat_old_linux_dirent __user *, + unsigned int count); +asmlinkage long compat_sys_getdents(unsigned int fd, + struct compat_linux_dirent __user *dirent, + unsigned int count); +asmlinkage long compat_sys_getdents64(unsigned int fd, + struct linux_dirent64 __user * dirent, + unsigned int count); +asmlinkage long compat_sys_vmsplice(int fd, const struct compat_iovec __user *, + unsigned int nr_segs, unsigned int flags); +asmlinkage long compat_sys_open(const char __user *filename, int flags, + int mode); asmlinkage long compat_sys_openat(unsigned int dfd, const char __user *filename, int flags, int mode); +asmlinkage long compat_sys_open_by_handle_at(int mountdirfd, + struct file_handle __user *handle, + int flags); +asmlinkage long compat_sys_pselect6(int n, compat_ulong_t __user *inp, + compat_ulong_t __user *outp, + compat_ulong_t __user *exp, + struct compat_timespec __user *tsp, + void __user *sig); +asmlinkage long compat_sys_ppoll(struct pollfd __user *ufds, + unsigned int nfds, + struct compat_timespec __user *tsp, + const compat_sigset_t __user *sigmask, + compat_size_t sigsetsize); +#if (defined(CONFIG_NFSD) || 
defined(CONFIG_NFSD_MODULE)) && !defined(CONFIG_NFSD_DEPRECATED) +union compat_nfsctl_res; +struct compat_nfsctl_arg; +asmlinkage long compat_sys_nfsservctl(int cmd, + struct compat_nfsctl_arg __user *arg, + union compat_nfsctl_res __user *res); +#else +long asmlinkage compat_sys_nfsservctl(int cmd, void *notused, void *notused2); +#endif +asmlinkage long compat_sys_signalfd4(int ufd, + const compat_sigset_t __user *sigmask, + compat_size_t sigsetsize, int flags); +asmlinkage long compat_sys_get_mempolicy(int __user *policy, + compat_ulong_t __user *nmask, + compat_ulong_t maxnode, + compat_ulong_t addr, + compat_ulong_t flags); +asmlinkage long compat_sys_set_mempolicy(int mode, compat_ulong_t __user *nmask, + compat_ulong_t maxnode); +asmlinkage long compat_sys_mbind(compat_ulong_t start, compat_ulong_t len, + compat_ulong_t mode, + compat_ulong_t __user *nmask, + compat_ulong_t maxnode, compat_ulong_t flags); + +asmlinkage long compat_sys_setsockopt(int fd, int level, int optname, + char __user *optval, unsigned int optlen); +asmlinkage long compat_sys_sendmsg(int fd, struct compat_msghdr __user *msg, + unsigned flags); +asmlinkage long compat_sys_recvmsg(int fd, struct compat_msghdr __user *msg, + unsigned int flags); +asmlinkage long compat_sys_recv(int fd, void __user *buf, size_t len, + unsigned flags); +asmlinkage long compat_sys_recvfrom(int fd, void __user *buf, size_t len, + unsigned flags, struct sockaddr __user *addr, + int __user *addrlen); +asmlinkage long compat_sys_recvmmsg(int fd, struct compat_mmsghdr __user *mmsg, + unsigned vlen, unsigned int flags, + struct compat_timespec __user *timeout); +asmlinkage long compat_sys_nanosleep(struct compat_timespec __user *rqtp, + struct compat_timespec __user *rmtp); +asmlinkage long compat_sys_getitimer(int which, + struct compat_itimerval __user *it); +asmlinkage long compat_sys_setitimer(int which, + struct compat_itimerval __user *in, + struct compat_itimerval __user *out); +asmlinkage long compat_sys_times(struct compat_tms __user *tbuf); +asmlinkage long compat_sys_setrlimit(unsigned int resource, + struct compat_rlimit __user *rlim); +asmlinkage long compat_sys_getrlimit (unsigned int resource, + struct compat_rlimit __user *rlim); +asmlinkage long compat_sys_getrusage(int who, struct compat_rusage __user *ru); +asmlinkage long compat_sys_sched_setaffinity(compat_pid_t pid, + unsigned int len, + compat_ulong_t __user *user_mask_ptr); +asmlinkage long compat_sys_sched_getaffinity(compat_pid_t pid, + unsigned int len, + compat_ulong_t __user *user_mask_ptr); +asmlinkage long compat_sys_timer_create(clockid_t which_clock, + struct compat_sigevent __user *timer_event_spec, + timer_t __user *created_timer_id); +asmlinkage long compat_sys_timer_settime(timer_t timer_id, int flags, + struct compat_itimerspec __user *new, + struct compat_itimerspec __user *old); +asmlinkage long compat_sys_timer_gettime(timer_t timer_id, + struct compat_itimerspec __user *setting); +asmlinkage long compat_sys_clock_settime(clockid_t which_clock, + struct compat_timespec __user *tp); +asmlinkage long compat_sys_clock_gettime(clockid_t which_clock, + struct compat_timespec __user *tp); +asmlinkage long compat_sys_clock_adjtime(clockid_t which_clock, + struct compat_timex __user *tp); +asmlinkage long compat_sys_clock_getres(clockid_t which_clock, + struct compat_timespec __user *tp); +asmlinkage long compat_sys_clock_nanosleep(clockid_t which_clock, int flags, + struct compat_timespec __user *rqtp, + struct compat_timespec __user *rmtp); +asmlinkage 
long compat_sys_rt_sigtimedwait (compat_sigset_t __user *uthese, + struct compat_siginfo __user *uinfo, + struct compat_timespec __user *uts, compat_size_t sigsetsize); +asmlinkage long compat_sys_rt_sigsuspend(compat_sigset_t __user *unewset, + compat_size_t sigsetsize); +asmlinkage long compat_sys_sysinfo(struct compat_sysinfo __user *info); +asmlinkage long compat_sys_ioctl(unsigned int fd, unsigned int cmd, + unsigned long arg); +asmlinkage long compat_sys_futex(u32 __user *uaddr, int op, u32 val, + struct compat_timespec __user *utime, u32 __user *uaddr2, + u32 val3); +asmlinkage long compat_sys_getsockopt(int fd, int level, int optname, + char __user *optval, int __user *optlen); +asmlinkage long compat_sys_kexec_load(unsigned long entry, + unsigned long nr_segments, + struct compat_kexec_segment __user *, + unsigned long flags); +asmlinkage long compat_sys_mq_getsetattr(mqd_t mqdes, + const struct compat_mq_attr __user *u_mqstat, + struct compat_mq_attr __user *u_omqstat); +asmlinkage long compat_sys_mq_notify(mqd_t mqdes, + const struct compat_sigevent __user *u_notification); +asmlinkage long compat_sys_mq_open(const char __user *u_name, + int oflag, compat_mode_t mode, + struct compat_mq_attr __user *u_attr); +asmlinkage long compat_sys_mq_timedsend(mqd_t mqdes, + const char __user *u_msg_ptr, + size_t msg_len, unsigned int msg_prio, + const struct compat_timespec __user *u_abs_timeout); +asmlinkage ssize_t compat_sys_mq_timedreceive(mqd_t mqdes, + char __user *u_msg_ptr, + size_t msg_len, unsigned int __user *u_msg_prio, + const struct compat_timespec __user *u_abs_timeout); +asmlinkage long compat_sys_socketcall(int call, u32 __user *args); +asmlinkage long compat_sys_sysctl(struct compat_sysctl_args __user *args); extern ssize_t compat_rw_copy_check_uvector(int type, const struct compat_iovec __user *uvector, unsigned long nr_segs, diff --git a/kernel/compat.c b/kernel/compat.c index 38b1d2c1cbe..80c28562f57 100644 --- a/kernel/compat.c +++ b/kernel/compat.c @@ -293,6 +293,8 @@ asmlinkage long compat_sys_times(struct compat_tms __user *tbuf) return compat_jiffies_to_clock_t(jiffies); } +#ifdef __ARCH_WANT_SYS_SIGPENDING + /* * Assumption: old_sigset_t and compat_old_sigset_t are both * types that can be passed to put_user()/get_user(). @@ -312,6 +314,10 @@ asmlinkage long compat_sys_sigpending(compat_old_sigset_t __user *set) return ret; } +#endif + +#ifdef __ARCH_WANT_SYS_SIGPROCMASK + asmlinkage long compat_sys_sigprocmask(int how, compat_old_sigset_t __user *set, compat_old_sigset_t __user *oset) { @@ -333,6 +339,8 @@ asmlinkage long compat_sys_sigprocmask(int how, compat_old_sigset_t __user *set, return ret; } +#endif + asmlinkage long compat_sys_setrlimit(unsigned int resource, struct compat_rlimit __user *rlim) { -- cgit v1.2.3-70-g09d2 From d410fa4ef99112386de5f218dd7df7b4fca910b4 Mon Sep 17 00:00:00 2001 From: Randy Dunlap Date: Thu, 19 May 2011 15:59:38 -0700 Subject: Create Documentation/security/, move LSM-, credentials-, and keys-related files from Documentation/ to Documentation/security/, add Documentation/security/00-INDEX, and update all occurrences of Documentation/ to Documentation/security/. 
--- Documentation/00-INDEX | 6 +- Documentation/SELinux.txt | 27 - Documentation/Smack.txt | 541 --------- Documentation/apparmor.txt | 39 - Documentation/credentials.txt | 581 ---------- Documentation/filesystems/nfs/idmapper.txt | 4 +- Documentation/keys-request-key.txt | 202 ---- Documentation/keys-trusted-encrypted.txt | 145 --- Documentation/keys.txt | 1290 --------------------- Documentation/networking/dns_resolver.txt | 4 +- Documentation/security/00-INDEX | 18 + Documentation/security/SELinux.txt | 27 + Documentation/security/Smack.txt | 541 +++++++++ Documentation/security/apparmor.txt | 39 + Documentation/security/credentials.txt | 581 ++++++++++ Documentation/security/keys-request-key.txt | 202 ++++ Documentation/security/keys-trusted-encrypted.txt | 145 +++ Documentation/security/keys.txt | 1290 +++++++++++++++++++++ Documentation/security/tomoyo.txt | 55 + Documentation/tomoyo.txt | 55 - MAINTAINERS | 6 +- include/linux/cred.h | 2 +- include/linux/key.h | 2 +- kernel/cred.c | 2 +- scripts/selinux/README | 2 +- security/apparmor/match.c | 2 +- security/apparmor/policy_unpack.c | 4 +- security/keys/encrypted.c | 2 +- security/keys/request_key.c | 2 +- security/keys/request_key_auth.c | 2 +- security/keys/trusted.c | 2 +- 31 files changed, 2918 insertions(+), 2902 deletions(-) delete mode 100644 Documentation/SELinux.txt delete mode 100644 Documentation/Smack.txt delete mode 100644 Documentation/apparmor.txt delete mode 100644 Documentation/credentials.txt delete mode 100644 Documentation/keys-request-key.txt delete mode 100644 Documentation/keys-trusted-encrypted.txt delete mode 100644 Documentation/keys.txt create mode 100644 Documentation/security/00-INDEX create mode 100644 Documentation/security/SELinux.txt create mode 100644 Documentation/security/Smack.txt create mode 100644 Documentation/security/apparmor.txt create mode 100644 Documentation/security/credentials.txt create mode 100644 Documentation/security/keys-request-key.txt create mode 100644 Documentation/security/keys-trusted-encrypted.txt create mode 100644 Documentation/security/keys.txt create mode 100644 Documentation/security/tomoyo.txt delete mode 100644 Documentation/tomoyo.txt (limited to 'kernel') diff --git a/Documentation/00-INDEX b/Documentation/00-INDEX index c17cd4bb229..c8c1cf631b3 100644 --- a/Documentation/00-INDEX +++ b/Documentation/00-INDEX @@ -192,10 +192,6 @@ kernel-docs.txt - listing of various WWW + books that document kernel internals. kernel-parameters.txt - summary listing of command line / boot prompt args for the kernel. -keys-request-key.txt - - description of the kernel key request service. -keys.txt - - description of the kernel key retention service. kobject.txt - info of the kobject infrastructure of the Linux kernel. kprobes.txt @@ -294,6 +290,8 @@ scheduler/ - directory with info on the scheduler. scsi/ - directory with info on Linux scsi support. +security/ + - directory that contains security-related info serial/ - directory with info on the low level serial API. serial-console.txt diff --git a/Documentation/SELinux.txt b/Documentation/SELinux.txt deleted file mode 100644 index 07eae00f331..00000000000 --- a/Documentation/SELinux.txt +++ /dev/null @@ -1,27 +0,0 @@ -If you want to use SELinux, chances are you will want -to use the distro-provided policies, or install the -latest reference policy release from - http://oss.tresys.com/projects/refpolicy - -However, if you want to install a dummy policy for -testing, you can do using 'mdp' provided under -scripts/selinux. 
Note that this requires the selinux -userspace to be installed - in particular you will -need checkpolicy to compile a kernel, and setfiles and -fixfiles to label the filesystem. - - 1. Compile the kernel with selinux enabled. - 2. Type 'make' to compile mdp. - 3. Make sure that you are not running with - SELinux enabled and a real policy. If - you are, reboot with selinux disabled - before continuing. - 4. Run install_policy.sh: - cd scripts/selinux - sh install_policy.sh - -Step 4 will create a new dummy policy valid for your -kernel, with a single selinux user, role, and type. -It will compile the policy, will set your SELINUXTYPE to -dummy in /etc/selinux/config, install the compiled policy -as 'dummy', and relabel your filesystem. diff --git a/Documentation/Smack.txt b/Documentation/Smack.txt deleted file mode 100644 index e9dab41c0fe..00000000000 --- a/Documentation/Smack.txt +++ /dev/null @@ -1,541 +0,0 @@ - - - "Good for you, you've decided to clean the elevator!" - - The Elevator, from Dark Star - -Smack is the Simplified Mandatory Access Control Kernel. -Smack is a kernel based implementation of mandatory access -control that includes simplicity in its primary design goals. - -Smack is not the only Mandatory Access Control scheme -available for Linux. Those new to Mandatory Access Control -are encouraged to compare Smack with the other mechanisms -available to determine which is best suited to the problem -at hand. - -Smack consists of three major components: - - The kernel - - A start-up script and a few modified applications - - Configuration data - -The kernel component of Smack is implemented as a Linux -Security Modules (LSM) module. It requires netlabel and -works best with file systems that support extended attributes, -although xattr support is not strictly required. -It is safe to run a Smack kernel under a "vanilla" distribution. -Smack kernels use the CIPSO IP option. Some network -configurations are intolerant of IP options and can impede -access to systems that use them as Smack does. - -The startup script etc-init.d-smack should be installed -in /etc/init.d/smack and should be invoked early in the -start-up process. On Fedora rc5.d/S02smack is recommended. -This script ensures that certain devices have the correct -Smack attributes and loads the Smack configuration if -any is defined. This script invokes two programs that -ensure configuration data is properly formatted. These -programs are /usr/sbin/smackload and /usr/sbin/smackcipso. -The system will run just fine without these programs, -but it will be difficult to set access rules properly. - -A version of "ls" that provides a "-M" option to display -Smack labels on long listing is available. - -A hacked version of sshd that allows network logins by users -with specific Smack labels is available. This version does -not work for scp. You must set the /etc/ssh/sshd_config -line: - UsePrivilegeSeparation no - -The format of /etc/smack/usr is: - - username smack - -In keeping with the intent of Smack, configuration data is -minimal and not strictly required. The most important -configuration step is mounting the smackfs pseudo filesystem. - -Add this line to /etc/fstab: - - smackfs /smack smackfs smackfsdef=* 0 0 - -and create the /smack directory for mounting. - -Smack uses extended attributes (xattrs) to store file labels. -The command to set a Smack label on a file is: - - # attr -S -s SMACK64 -V "value" path - -NOTE: Smack labels are limited to 23 characters.
The attr command - does not enforce this restriction and can be used to set - invalid Smack labels on files. - -If you don't do anything special all users will get the floor ("_") -label when they log in. If you do want to log in via the hacked ssh -at other labels use the attr command to set the smack value on the -home directory and its contents. - -You can add access rules in /etc/smack/accesses. They take the form: - - subjectlabel objectlabel access - -access is a combination of the letters rwxa which specify the -kind of access permitted to a subject with subjectlabel on an -object with objectlabel. If there is no rule, no access is allowed. - -A process can see the smack label it is running with by -reading /proc/self/attr/current. A privileged process can -set the process smack by writing there. - -Look for additional programs on http://schaufler-ca.com - -From the Smack Whitepaper: - -The Simplified Mandatory Access Control Kernel - -Casey Schaufler -casey@schaufler-ca.com - -Mandatory Access Control - -Computer systems employ a variety of schemes to constrain how information is -shared among the people and services using the machine. Some of these schemes -allow the program or user to decide what other programs or users are allowed -access to pieces of data. These schemes are called discretionary access -control mechanisms because the access control is specified at the discretion -of the user. Other schemes do not leave the decision regarding what a user or -program can access up to users or programs. These schemes are called mandatory -access control mechanisms because you don't have a choice regarding the users -or programs that have access to pieces of data. - -Bell & LaPadula - -From the middle of the 1980's until the turn of the century Mandatory Access -Control (MAC) was very closely associated with the Bell & LaPadula security -model, a mathematical description of the United States Department of Defense -policy for marking paper documents. MAC in this form enjoyed a following -within the Capital Beltway and Scandinavian supercomputer centers but was -often cited as failing to address general needs. - -Domain Type Enforcement - -Around the turn of the century Domain Type Enforcement (DTE) became popular. -This scheme organizes users, programs, and data into domains that are -protected from each other. This scheme has been widely deployed as a component -of popular Linux distributions. The administrative overhead required to -maintain this scheme and the detailed understanding of the whole system -necessary to provide a secure domain mapping leads to the scheme being -disabled or used in limited ways in the majority of cases. - -Smack - -Smack is a Mandatory Access Control mechanism designed to provide useful MAC -while avoiding the pitfalls of its predecessors. The limitations of Bell & -LaPadula are addressed by providing a scheme whereby access can be controlled -according to the requirements of the system and its purpose rather than those -imposed by an arcane government policy. The complexity of Domain Type -Enforcement is avoided by defining access controls in terms of the access -modes already in use. - -Smack Terminology - -The jargon used to talk about Smack will be familiar to those who have dealt -with other MAC systems and shouldn't be too difficult for the uninitiated to -pick up. There are four terms that are used in a specific way and that are -especially important: - - Subject: A subject is an active entity on the computer system.
- On Smack a subject is a task, which is in turn the basic unit - of execution. - - Object: An object is a passive entity on the computer system. - On Smack files of all types, IPC, and tasks can be objects. - - Access: Any attempt by a subject to put information into or get - information from an object is an access. - - Label: Data that identifies the Mandatory Access Control - characteristics of a subject or an object. - -These definitions are consistent with the traditional use in the security -community. There are also some terms from Linux that are likely to crop up: - - Capability: A task that possesses a capability has permission to - violate an aspect of the system security policy, as identified by - the specific capability. A task that possesses one or more - capabilities is a privileged task, whereas a task with no - capabilities is an unprivileged task. - - Privilege: A task that is allowed to violate the system security - policy is said to have privilege. As of this writing a task can - have privilege either by possessing capabilities or by having an - effective user ID of root. - -Smack Basics - -Smack is an extension to a Linux system. It enforces additional restrictions -on what subjects can access which objects, based on the labels attached to -each of the subject and the object. - -Labels - -Smack labels are ASCII character strings, one to twenty-three characters in -length. Single character labels using special characters, that being anything -other than a letter or digit, are reserved for use by the Smack development -team. Smack labels are unstructured, case sensitive, and the only operation -ever performed on them is comparison for equality. Smack labels cannot -contain unprintable characters, the "/" (slash), the "\" (backslash), the "'" -(quote) and '"' (double-quote) characters. -Smack labels cannot begin with a '-', which is reserved for special options. - -There are some predefined labels: - - _ Pronounced "floor", a single underscore character. - ^ Pronounced "hat", a single circumflex character. - * Pronounced "star", a single asterisk character. - ? Pronounced "huh", a single question mark character. - @ Pronounced "Internet", a single at sign character. - -Every task on a Smack system is assigned a label. System tasks, such as -init(8) and system daemons, are run with the floor ("_") label. User tasks -are assigned labels according to the specification found in the -/etc/smack/user configuration file. - -Access Rules - -Smack uses the traditional access modes of Linux. These modes are read, -execute, write, and occasionally append. There are a few cases where the -access mode may not be obvious. These include: - - Signals: A signal is a write operation from the subject task to - the object task. - Internet Domain IPC: Transmission of a packet is considered a - write operation from the source task to the destination task. - -Smack restricts access based on the label attached to a subject and the label -attached to the object it is trying to access. The rules enforced are, in -order: - - 1. Any access requested by a task labeled "*" is denied. - 2. A read or execute access requested by a task labeled "^" - is permitted. - 3. A read or execute access requested on an object labeled "_" - is permitted. - 4. Any access requested on an object labeled "*" is permitted. - 5. Any access requested by a task on an object with the same - label is permitted. - 6. Any access requested that is explicitly defined in the loaded - rule set is permitted. - 7.
Any other access is denied. - -Smack Access Rules - -With the isolation provided by Smack, access separation is simple. There are -many interesting cases where limited access by subjects to objects with -different labels is desired. One example is the familiar spy model of -sensitivity, where a scientist working on a highly classified project would be -able to read documents of lower classifications and anything she writes will -be "born" highly classified. To accommodate such schemes Smack includes a -mechanism for specifying rules allowing access between labels. - -Access Rule Format - -The format of an access rule is: - - subject-label object-label access - -Where subject-label is the Smack label of the task, object-label is the Smack -label of the thing being accessed, and access is a string specifying the sort -of access allowed. The Smack labels are limited to 23 characters. The access -specification is searched for letters that describe access modes: - - a: indicates that append access should be granted. - r: indicates that read access should be granted. - w: indicates that write access should be granted. - x: indicates that execute access should be granted. - -Uppercase values for the specification letters are allowed as well. -Access mode specifications can be in any order. Examples of acceptable rules -are: - - TopSecret Secret rx - Secret Unclass R - Manager Game x - User HR w - New Old rRrRr - Closed Off - - -Examples of unacceptable rules are: - - Top Secret Secret rx - Ace Ace r - Odd spells waxbeans - -Spaces are not allowed in labels. Since a subject always has access to files -with the same label, specifying a rule for that case is pointless. Only -valid letters (rwxaRWXA) and the dash ('-') character are allowed in -access specifications. The dash is a placeholder, so "a-r" is the same -as "ar". A lone dash is used to specify that no access should be allowed. - -Applying Access Rules - -The developers of Linux rarely define new sorts of things, usually importing -schemes and concepts from other systems. Most often, the other systems are -variants of Unix. Unix has many endearing properties, but consistency of -access control models is not one of them. Smack strives to treat accesses as -uniformly as is sensible while keeping with the spirit of the underlying -mechanism. - -File system objects including files, directories, named pipes, symbolic links, -and devices require access permissions that closely match those used by mode -bit access. To open a file for reading, read access is required on the file. To -search a directory requires execute access. Creating a file with write access -requires both read and write access on the containing directory. Deleting a -file requires read and write access to the file and to the containing -directory. It is possible that a user may be able to see that a file exists -but not any of its attributes by the circumstance of having read access to the -containing directory but not to the differently labeled file. This is an -artifact of the file name being data in the directory, not a part of the file. - -IPC objects, message queues, semaphore sets, and memory segments exist in flat -namespaces and access requests are only required to match the object in -question. - -Process objects reflect tasks on the system and the Smack label used to access -them is the same Smack label that the task would use for its own access -attempts. Sending a signal via the kill() system call is a write operation -from the signaler to the recipient.
Debugging a process requires both reading -and writing. Creating a new task is an internal operation that results in two -tasks with identical Smack labels and requires no access checks. - -Sockets are data structures attached to processes and sending a packet from -one process to another requires that the sender have write access to the -receiver. The receiver is not required to have read access to the sender. - -Setting Access Rules - -The configuration file /etc/smack/accesses contains the rules to be set at -system startup. The contents are written to the special file /smack/load. -Rules can be written to /smack/load at any time and take effect immediately. -For any pair of subject and object labels there can be only one rule, with the -most recently specified overriding any earlier specification. - -The program smackload is provided to ensure data is formatted -properly when written to /smack/load. This program reads lines -of the form - - subjectlabel objectlabel mode. - -Task Attribute - -The Smack label of a process can be read from /proc/<pid>/attr/current. A -process can read its own Smack label from /proc/self/attr/current. A -privileged process can change its own Smack label by writing to -/proc/self/attr/current but not the label of another process. - -File Attribute - -The Smack label of a filesystem object is stored as an extended attribute -named SMACK64 on the file. This attribute is in the security namespace. It can -only be changed by a process with privilege. - -Privilege - -A process with CAP_MAC_OVERRIDE is privileged. - -Smack Networking - -As mentioned before, Smack enforces access control on network protocol -transmissions. Every packet sent by a Smack process is tagged with its Smack -label. This is done by adding a CIPSO tag to the header of the IP packet. Each -packet received is expected to have a CIPSO tag that identifies the label and -if it lacks such a tag the network ambient label is assumed. Before the packet -is delivered a check is made to determine that a subject with the label on the -packet has write access to the receiving process and if that is not the case -the packet is dropped. - -CIPSO Configuration - -It is normally unnecessary to specify the CIPSO configuration. The default -values used by the system handle all internal cases. Smack will compose CIPSO -label values to match the Smack labels being used without administrative -intervention. Unlabeled packets that come into the system will be given the -ambient label. - -Smack requires configuration in the case where packets from a non-Smack -system that speaks CIPSO may be encountered. Usually this will be a Trusted -Solaris system, but there are other, less widely deployed systems out there. -CIPSO provides 3 important values, a Domain Of Interpretation (DOI), a level, -and a category set with each packet. The DOI is intended to identify a group -of systems that use compatible labeling schemes, and the DOI specified on the -smack system must match that of the remote system or packets will be -discarded. The DOI is 3 by default. The value can be read from /smack/doi and -can be changed by writing to /smack/doi. - -The label and category set are mapped to a Smack label as defined in -/etc/smack/cipso. - -A Smack/CIPSO mapping has the form: - - smack level [category [category]*] - -Smack does not expect the level or category sets to be related in any -particular way and does not assume or assign accesses based on them.
Some -examples of mappings: - - TopSecret 7 - TS:A,B 7 1 2 - SecBDE 5 2 4 6 - RAFTERS 7 12 26 - -The ":" and "," characters are permitted in a Smack label but have no special -meaning. - -The mapping of Smack labels to CIPSO values is defined by writing to -/smack/cipso. Again, the format of data written to this special file -is highly restrictive, so the program smackcipso is provided to -ensure the writes are done properly. This program takes mappings -on the standard input and sends them to /smack/cipso properly. - -In addition to explicit mappings Smack supports direct CIPSO mappings. One -CIPSO level is used to indicate that the category set passed in the packet is -in fact an encoding of the Smack label. The level used is 250 by default. The -value can be read from /smack/direct and changed by writing to /smack/direct. - -Socket Attributes - -There are two attributes that are associated with sockets. These attributes -can only be set by privileged tasks, but any task can read them for their own -sockets. - - SMACK64IPIN: The Smack label of the task object. A privileged - program that will enforce policy may set this to the star label. - - SMACK64IPOUT: The Smack label transmitted with outgoing packets. - A privileged program may set this to match the label of another - task with which it hopes to communicate. - -Smack Netlabel Exceptions - -You will often find that your labeled application has to talk to the outside, -unlabeled world. To do this, there's a special file /smack/netlabel where you can -add some exceptions in the form of: -@IP1 LABEL1 or -@IP2/MASK LABEL2 - -It means that your application will have unlabeled access to @IP1 if it has -write access on LABEL1, and access to the subnet @IP2/MASK if it has write -access on LABEL2. - -Entries in the /smack/netlabel file are matched by longest mask first, like in -classless IPv4 routing. - -A special label '@' and an option '-CIPSO' can be used there: -@ means Internet, any application with any label has access to it --CIPSO means standard CIPSO networking - -If you don't know what CIPSO is and don't plan to use it, you can just do: -echo 127.0.0.1 -CIPSO > /smack/netlabel -echo 0.0.0.0/0 @ > /smack/netlabel - -If you use CIPSO on your 192.168.0.0/16 local network and also need unlabeled -Internet access, you can have: -echo 127.0.0.1 -CIPSO > /smack/netlabel -echo 192.168.0.0/16 -CIPSO > /smack/netlabel -echo 0.0.0.0/0 @ > /smack/netlabel - - -Writing Applications for Smack - -There are three sorts of applications that will run on a Smack system. How an -application interacts with Smack will determine what it will have to do to -work properly under Smack. - -Smack Ignorant Applications - -By far the majority of applications have no reason whatever to care about the -unique properties of Smack. Since invoking a program has no impact on the -Smack label associated with the process the only concern likely to arise is -whether the process has execute access to the program. - -Smack Relevant Applications - -Some programs can be improved by teaching them about Smack, but do not make -any security decisions themselves. The utility ls(1) is one example of such a -program. - -Smack Enforcing Applications - -These are special programs that not only know about Smack, but participate in -the enforcement of system policy. In most cases these are the programs that -set up user sessions. There are also network services that provide information -to processes running with various labels.
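As an illustration of the task attribute described above, a Smack relevant application might simply report the label it runs with. A minimal userspace sketch (not part of the Smack distribution; error handling trimmed):

    #include <stdio.h>

    int main(void)
    {
        char label[64];
        FILE *f = fopen("/proc/self/attr/current", "r");

        if (!f)
            return 1;
        /* the file holds the current task's Smack label */
        if (fgets(label, sizeof(label), f))
            printf("current Smack label: %s\n", label);
        fclose(f);
        return 0;
    }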
- -File System Interfaces - -Smack maintains labels on file system objects using extended attributes. The -Smack label of a file, directory, or other file system object can be obtained -using getxattr(2). - - len = getxattr("/", "security.SMACK64", value, sizeof (value)); - -will put the Smack label of the root directory into value. A privileged -process can set the Smack label of a file system object with setxattr(2). - - len = strlen("Rubble"); - rc = setxattr("/foo", "security.SMACK64", "Rubble", len, 0); - -will set the Smack label of /foo to "Rubble" if the program has appropriate -privilege. - -Socket Interfaces - -The socket attributes can be read using fgetxattr(2). - -A privileged process can set the Smack label of outgoing packets with -fsetxattr(2). - - len = strlen("Rubble"); - rc = fsetxattr(fd, "security.SMACK64IPOUT", "Rubble", len, 0); - -will set the Smack label "Rubble" on packets going out from the socket if the -program has appropriate privilege. - - rc = fsetxattr(fd, "security.SMACK64IPIN", "*", strlen("*"), 0); - -will set the Smack label "*" as the object label against which incoming -packets will be checked if the program has appropriate privilege. - -Administration - -Smack supports some mount options: - - smackfsdef=label: specifies the label to give files that lack - the Smack label extended attribute. - - smackfsroot=label: specifies the label to assign the root of the - file system if it lacks the Smack extended attribute. - - smackfshat=label: specifies a label that must have read access to - all labels set on the filesystem. Not yet enforced. - - smackfsfloor=label: specifies a label to which all labels set on the - filesystem must have read access. Not yet enforced. - -These mount options apply to all file system types. - -Smack auditing - -If you want Smack auditing of security events, you need to set CONFIG_AUDIT -in your kernel configuration. -By default, all denied events will be audited. You can change this behavior by -writing a single character to the /smack/logging file: -0: no logging -1: log denied (default) -2: log accepted -3: log denied & accepted - -Events are logged as 'key=value' pairs; for each event you will at least get -the subject, the object, the rights requested, the action, the kernel function -that triggered the event, plus other pairs depending on the type of event -audited. diff --git a/Documentation/apparmor.txt b/Documentation/apparmor.txt deleted file mode 100644 index 93c1fd7d063..00000000000 --- a/Documentation/apparmor.txt +++ /dev/null @@ -1,39 +0,0 @@ ---- What is AppArmor? --- - -AppArmor is a MAC style security extension for the Linux kernel. It implements -a task centered policy, with task "profiles" being created and loaded -from user space. Tasks on the system that do not have a profile defined for -them run in an unconfined state which is equivalent to standard Linux DAC -permissions. - ---- How to enable/disable --- - -set CONFIG_SECURITY_APPARMOR=y - -If AppArmor should be selected as the default security module then - set CONFIG_DEFAULT_SECURITY="apparmor" - and CONFIG_SECURITY_APPARMOR_BOOTPARAM_VALUE=1 - -Build the kernel - -If AppArmor is not the default security module it can be enabled by passing -security=apparmor on the kernel's command line.
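For example, a boot loader entry carrying this parameter might look like the following sketch (kernel image and root device are hypothetical):

    kernel /boot/vmlinuz root=/dev/sda1 security=apparmor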
- -If AppArmor is the default security module it can be disabled by passing -apparmor=0, security=XXXX (where XXXX is a valid security module), on the -kernel's command line. - -For AppArmor to enforce any restrictions beyond standard Linux DAC permissions, -policy must be loaded into the kernel from user space (see the Documentation -and tools links). - ---- Documentation --- - -Documentation can be found on the wiki. - ---- Links --- - -Mailing List - apparmor@lists.ubuntu.com -Wiki - http://apparmor.wiki.kernel.org/ -User space tools - https://launchpad.net/apparmor -Kernel module - git://git.kernel.org/pub/scm/linux/kernel/git/jj/apparmor-dev.git diff --git a/Documentation/credentials.txt b/Documentation/credentials.txt deleted file mode 100644 index 995baf379c0..00000000000 --- a/Documentation/credentials.txt +++ /dev/null @@ -1,581 +0,0 @@ - ==================== - CREDENTIALS IN LINUX - ==================== - -By: David Howells - -Contents: - - (*) Overview. - - (*) Types of credentials. - - (*) File markings. - - (*) Task credentials. - - - Immutable credentials. - - Accessing task credentials. - - Accessing another task's credentials. - - Altering credentials. - - Managing credentials. - - (*) Open file credentials. - - (*) Overriding the VFS's use of credentials. - - -======== -OVERVIEW -======== - -There are several parts to the security check performed by Linux when one -object acts upon another: - - (1) Objects. - - Objects are things in the system that may be acted upon directly by - userspace programs. Linux has a variety of actionable objects, including: - - - Tasks - - Files/inodes - - Sockets - - Message queues - - Shared memory segments - - Semaphores - - Keys - - As a part of the description of all these objects there is a set of - credentials. What's in the set depends on the type of object. - - (2) Object ownership. - - Amongst the credentials of most objects, there will be a subset that - indicates the ownership of that object. This is used for resource - accounting and limitation (disk quotas and task rlimits for example). - - In a standard UNIX filesystem, for instance, this will be defined by the - UID marked on the inode. - - (3) The objective context. - - Also amongst the credentials of those objects, there will be a subset that - indicates the 'objective context' of that object. This may or may not be - the same set as in (2) - in standard UNIX files, for instance, this is - defined by the UID and the GID marked on the inode. - - The objective context is used as part of the security calculation that is - carried out when an object is acted upon. - - (4) Subjects. - - A subject is an object that is acting upon another object. - - Most of the objects in the system are inactive: they don't act on other - objects within the system. Processes/tasks are the obvious exception: - they do stuff; they access and manipulate things. - - Objects other than tasks may under some circumstances also be subjects. - For instance an open file may send SIGIO to a task using the UID and EUID - given to it by a task that called fcntl(F_SETOWN) upon it. In this case, - the file struct will have a subjective context too. - - (5) The subjective context. - - A subject has an additional interpretation of its credentials. A subset - of its credentials forms the 'subjective context'. The subjective context - is used as part of the security calculation that is carried out when a - subject acts.
- - A Linux task, for example, has the FSUID, FSGID and the supplementary - group list for when it is acting upon a file - which are quite separate - from the real UID and GID that normally form the objective context of the - task. - - (6) Actions. - - Linux has a number of actions available that a subject may perform upon an - object. The set of actions available depends on the nature of the subject - and the object. - - Actions include reading, writing, creating and deleting files; forking or - signalling and tracing tasks. - - (7) Rules, access control lists and security calculations. - - When a subject acts upon an object, a security calculation is made. This - involves taking the subjective context, the objective context and the - action, and searching one or more sets of rules to see whether the subject - is granted or denied permission to act in the desired manner on the - object, given those contexts. - - There are two main sources of rules: - - (a) Discretionary access control (DAC): - - Sometimes the object will include sets of rules as part of its - description. This is an 'Access Control List' or 'ACL'. A Linux - file may supply more than one ACL. - - A traditional UNIX file, for example, includes a permissions mask that - is an abbreviated ACL with three fixed classes of subject ('user', - 'group' and 'other'), each of which may be granted certain privileges - ('read', 'write' and 'execute' - whatever those map to for the object - in question). UNIX file permissions do not allow the arbitrary - specification of subjects, however, and so are of limited use. - - A Linux file might also sport a POSIX ACL. This is a list of rules - that grants various permissions to arbitrary subjects. - - (b) Mandatory access control (MAC): - - The system as a whole may have one or more sets of rules that get - applied to all subjects and objects, regardless of their source. - SELinux and Smack are examples of this. - - In the case of SELinux and Smack, each object is given a label as part - of its credentials. When an action is requested, they take the - subject label, the object label and the action and look for a rule - that says that this action is either granted or denied. - - -==================== -TYPES OF CREDENTIALS -==================== - -The Linux kernel supports the following types of credentials: - - (1) Traditional UNIX credentials. - - Real User ID - Real Group ID - - The UID and GID are carried by most, if not all, Linux objects, even if in - some cases it has to be invented (FAT or CIFS files for example, which are - derived from Windows). These (mostly) define the objective context of - that object, with tasks being slightly different in some cases. - - Effective, Saved and FS User ID - Effective, Saved and FS Group ID - Supplementary groups - - These are additional credentials used by tasks only. Usually, an - EUID/EGID/GROUPS will be used as the subjective context, and real UID/GID - will be used as the objective. For tasks, it should be noted that this is - not always true. - - (2) Capabilities. - - Set of permitted capabilities - Set of inheritable capabilities - Set of effective capabilities - Capability bounding set - - These are only carried by tasks. They indicate superior capabilities - granted piecemeal to a task that an ordinary task wouldn't otherwise have. - These are manipulated implicitly by changes to the traditional UNIX - credentials, but can also be manipulated directly by the capset() system - call. 
- - The permitted capabilities are those caps that the process might grant - itself to its effective or permitted sets through capset(). This - inheritable set might also be so constrained. - - The effective capabilities are the ones that a task is actually allowed to - make use of itself. - - The inheritable capabilities are the ones that may get passed across - execve(). - - The bounding set limits the capabilities that may be inherited across - execve(), especially when a binary is executed that will execute as UID 0. - - (3) Secure management flags (securebits). - - These are only carried by tasks. These govern the way the above - credentials are manipulated and inherited over certain operations such as - execve(). They aren't used directly as objective or subjective - credentials. - - (4) Keys and keyrings. - - These are only carried by tasks. They carry and cache security tokens - that don't fit into the other standard UNIX credentials. They are for - making such things as network filesystem keys available to the file - accesses performed by processes, without the necessity of ordinary - programs having to know about security details involved. - - Keyrings are a special type of key. They carry sets of other keys and can - be searched for the desired key. Each process may subscribe to a number - of keyrings: - - Per-thread keyring - Per-process keyring - Per-session keyring - - When a process accesses a key, if not already present, it will normally be - cached on one of these keyrings for future accesses to find. - - For more information on using keys, see Documentation/keys.txt. - - (5) LSM - - The Linux Security Module allows extra controls to be placed over the - operations that a task may do. Currently Linux supports two main - alternate LSM options: SELinux and Smack. - - Both work by labelling the objects in a system and then applying sets of - rules (policies) that say what operations a task with one label may do to - an object with another label. - - (6) AF_KEY - - This is a socket-based approach to credential management for networking - stacks [RFC 2367]. It isn't discussed by this document as it doesn't - interact directly with task and file credentials; rather it keeps system - level credentials. - - -When a file is opened, part of the opening task's subjective context is -recorded in the file struct created. This allows operations using that file -struct to use those credentials instead of the subjective context of the task -that issued the operation. An example of this would be a file opened on a -network filesystem where the credentials of the opened file should be presented -to the server, regardless of who is actually doing a read or a write upon it. - - -============= -FILE MARKINGS -============= - -Files on disk or obtained over the network may have annotations that form the -objective security context of that file. Depending on the type of filesystem, -this may include one or more of the following: - - (*) UNIX UID, GID, mode; - - (*) Windows user ID; - - (*) Access control list; - - (*) LSM security label; - - (*) UNIX exec privilege escalation bits (SUID/SGID); - - (*) File capabilities exec privilege escalation bits. - -These are compared to the task's subjective security context, and certain -operations allowed or disallowed as a result. In the case of execve(), the -privilege escalation bits come into play, and may allow the resulting process -extra privileges, based on the annotations on the executable file.
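To make the exec escalation bits concrete, here is a minimal userspace sketch (the path is chosen only for illustration) that tests whether an executable carries the set-UID mark:

    #include <stdio.h>
    #include <sys/stat.h>

    int main(void)
    {
        struct stat st;

        /* S_ISUID: execve() may run the file with its owner's UID */
        if (stat("/usr/bin/passwd", &st) == 0 && (st.st_mode & S_ISUID))
            printf("set-UID escalation bit is set\n");
        return 0;
    }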
- - -================ -TASK CREDENTIALS -================ - -In Linux, all of a task's credentials are held in (uid, gid) or through -(groups, keys, LSM security) a refcounted structure of type 'struct cred'. -Each task points to its credentials by a pointer called 'cred' in its -task_struct. - -Once a set of credentials has been prepared and committed, it may not be -changed, barring the following exceptions: - - (1) its reference count may be changed; - - (2) the reference count on the group_info struct it points to may be changed; - - (3) the reference count on the security data it points to may be changed; - - (4) the reference count on any keyrings it points to may be changed; - - (5) any keyrings it points to may be revoked, expired or have their security - attributes changed; and - - (6) the contents of any keyrings to which it points may be changed (the whole - point of keyrings being a shared set of credentials, modifiable by anyone - with appropriate access). - -To alter anything in the cred struct, the copy-and-replace principle must be -adhered to. First take a copy, then alter the copy and then use RCU to change -the task pointer to make it point to the new copy. There are wrappers to aid -with this (see below). - -A task may only alter its _own_ credentials; it is no longer permitted for a -task to alter another's credentials. This means the capset() system call is no -longer permitted to take any PID other than the one of the current process. -Also keyctl_instantiate() and keyctl_negate() functions no longer permit -attachment to process-specific keyrings in the requesting process as the -instantiating process may need to create them. - - -IMMUTABLE CREDENTIALS ---------------------- - -Once a set of credentials has been made public (by calling commit_creds() for -example), it must be considered immutable, barring two exceptions: - - (1) The reference count may be altered. - - (2) Whilst the keyring subscriptions of a set of credentials may not be - changed, the keyrings subscribed to may have their contents altered. - -To catch accidental credential alteration at compile time, struct task_struct -has _const_ pointers to its credential sets, as does struct file. Furthermore, -certain functions such as get_cred() and put_cred() operate on const pointers, -thus rendering casts unnecessary, but require to temporarily ditch the const -qualification to be able to alter the reference count. - - -ACCESSING TASK CREDENTIALS --------------------------- - -A task being able to alter only its own credentials permits the current process -to read or replace its own credentials without the need for any form of locking -- which simplifies things greatly. It can just call: - - const struct cred *current_cred() - -to get a pointer to its credentials structure, and it doesn't have to release -it afterwards. 
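For instance, a minimal kernel-side sketch (the function name is hypothetical; it assumes the plain uid_t credential fields of this era) that consults the caller's own credentials:

    #include <linux/cred.h>
    #include <linux/fs.h>

    /* no reference counting or locking: we only read our own creds */
    static int current_owns_inode(const struct inode *inode)
    {
        const struct cred *cred = current_cred();

        return cred->fsuid == inode->i_uid;
    }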
- -There are convenience wrappers for retrieving specific aspects of a task's -credentials (the value is simply returned in each case): - - uid_t current_uid(void) Current's real UID - gid_t current_gid(void) Current's real GID - uid_t current_euid(void) Current's effective UID - gid_t current_egid(void) Current's effective GID - uid_t current_fsuid(void) Current's file access UID - gid_t current_fsgid(void) Current's file access GID - kernel_cap_t current_cap(void) Current's effective capabilities - void *current_security(void) Current's LSM security pointer - struct user_struct *current_user(void) Current's user account - -There are also convenience wrappers for retrieving specific associated pairs of -a task's credentials: - - void current_uid_gid(uid_t *, gid_t *); - void current_euid_egid(uid_t *, gid_t *); - void current_fsuid_fsgid(uid_t *, gid_t *); - -which return these pairs of values through their arguments after retrieving -them from the current task's credentials. - - -In addition, there is a function for obtaining a reference on the current -process's current set of credentials: - - const struct cred *get_current_cred(void); - -and functions for getting references to one of the credentials that don't -actually live in struct cred: - - struct user_struct *get_current_user(void); - struct group_info *get_current_groups(void); - -which get references to the current process's user accounting structure and -supplementary groups list respectively. - -Once a reference has been obtained, it must be released with put_cred(), -free_uid() or put_group_info() as appropriate. - - -ACCESSING ANOTHER TASK'S CREDENTIALS ------------------------------------- - -Whilst a task may access its own credentials without the need for locking, the -same is not true of a task wanting to access another task's credentials. It -must use the RCU read lock and rcu_dereference(). - -The rcu_dereference() is wrapped by: - - const struct cred *__task_cred(struct task_struct *task); - -This should be used inside the RCU read lock, as in the following example: - - void foo(struct task_struct *t, struct foo_data *f) - { - const struct cred *tcred; - ... - rcu_read_lock(); - tcred = __task_cred(t); - f->uid = tcred->uid; - f->gid = tcred->gid; - f->groups = get_group_info(tcred->groups); - rcu_read_unlock(); - ... - } - -Should it be necessary to hold another task's credentials for a long period of -time, and possibly to sleep whilst doing so, then the caller should get a -reference on them using: - - const struct cred *get_task_cred(struct task_struct *task); - -This does all the RCU magic inside of it. The caller must call put_cred() on -the credentials so obtained when they're finished with. - - [*] Note: The result of __task_cred() should not be passed directly to - get_cred() as this may race with commit_creds(). - -There are a couple of convenience functions to access bits of another task's -credentials, hiding the RCU magic from the caller: - - uid_t task_uid(task) Task's real UID - uid_t task_euid(task) Task's effective UID - -If the caller is holding the RCU read lock at the time anyway, then: - - __task_cred(task)->uid - __task_cred(task)->euid - -should be used instead. Similarly, if multiple aspects of a task's credentials -need to be accessed, RCU read lock should be used, __task_cred() called, the -result stored in a temporary pointer and then the credential aspects called -from that before dropping the lock. This prevents the potentially expensive -RCU magic from being invoked multiple times.
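Following that advice, a sketch (the helper name is hypothetical) that takes the lock once, caches the pointer, and reads several aspects before dropping the lock:

    #include <linux/cred.h>
    #include <linux/sched.h>

    static void sample_ids(struct task_struct *t,
                           uid_t *uid, uid_t *suid, gid_t *gid)
    {
        const struct cred *tcred;

        rcu_read_lock();
        tcred = __task_cred(t);    /* only valid while the lock is held */
        *uid = tcred->uid;
        *suid = tcred->suid;
        *gid = tcred->gid;
        rcu_read_unlock();
    }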
- -Should some other single aspect of another task's credentials need to be -accessed, then this can be used: - - task_cred_xxx(task, member) - -where 'member' is a non-pointer member of the cred struct. For instance: - - uid_t task_cred_xxx(task, suid); - -will retrieve 'struct cred::suid' from the task, doing the appropriate RCU -magic. This may not be used for pointer members as what they point to may -disappear the moment the RCU read lock is dropped. - - -ALTERING CREDENTIALS --------------------- - -As previously mentioned, a task may only alter its own credentials, and may not -alter those of another task. This means that it doesn't need to use any -locking to alter its own credentials. - -To alter the current process's credentials, a function should first prepare a -new set of credentials by calling: - - struct cred *prepare_creds(void); - -this locks current->cred_replace_mutex and then allocates and constructs a -duplicate of the current process's credentials, returning with the mutex still -held if successful. It returns NULL if not successful (out of memory). - -The mutex prevents ptrace() from altering the ptrace state of a process whilst -security checks on credentials construction and changing is taking place as -the ptrace state may alter the outcome, particularly in the case of execve(). - -The new credentials set should be altered appropriately, and any security -checks and hooks done. Both the current and the proposed sets of credentials -are available for this purpose as current_cred() will return the current set -still at this point. - - -When the credential set is ready, it should be committed to the current process -by calling: - - int commit_creds(struct cred *new); - -This will alter various aspects of the credentials and the process, giving the -LSM a chance to do likewise, then it will use rcu_assign_pointer() to actually -commit the new credentials to current->cred, it will release -current->cred_replace_mutex to allow ptrace() to take place, and it will notify -the scheduler and others of the changes. - -This function is guaranteed to return 0, so that it can be tail-called at the -end of such functions as sys_setresuid(). - -Note that this function consumes the caller's reference to the new credentials. -The caller should _not_ call put_cred() on the new credentials afterwards. - -Furthermore, once this function has been called on a new set of credentials, -those credentials may _not_ be changed further. - - -Should the security checks fail or some other error occur after prepare_creds() -has been called, then the following function should be invoked: - - void abort_creds(struct cred *new); - -This releases the lock on current->cred_replace_mutex that prepare_creds() got -and then releases the new credentials. - - -A typical credentials alteration function would look something like this: - - int alter_suid(uid_t suid) - { - struct cred *new; - int ret; - - new = prepare_creds(); - if (!new) - return -ENOMEM; - - new->suid = suid; - ret = security_alter_suid(new); - if (ret < 0) { - abort_creds(new); - return ret; - } - - return commit_creds(new); - } - - -MANAGING CREDENTIALS --------------------- - -There are some functions to help manage credentials: - - (*) void put_cred(const struct cred *cred); - - This releases a reference to the given set of credentials. If the - reference count reaches zero, the credentials will be scheduled for - destruction by the RCU system. 
- - (*) const struct cred *get_cred(const struct cred *cred); - - This gets a reference on a live set of credentials, returning a pointer to - that set of credentials. - - (*) struct cred *get_new_cred(struct cred *cred); - - This gets a reference on a set of credentials that is under construction - and is thus still mutable, returning a pointer to that set of credentials. - - -===================== -OPEN FILE CREDENTIALS -===================== - -When a new file is opened, a reference is obtained on the opening task's -credentials and this is attached to the file struct as 'f_cred' in place of -'f_uid' and 'f_gid'. Code that used to access file->f_uid and file->f_gid -should now access file->f_cred->fsuid and file->f_cred->fsgid. - -It is safe to access f_cred without the use of RCU or locking because the -pointer will not change over the lifetime of the file struct, and nor will the -contents of the cred struct pointed to, barring the exceptions listed above -(see the Task Credentials section). - - -======================================= -OVERRIDING THE VFS'S USE OF CREDENTIALS -======================================= - -Under some circumstances it is desirable to override the credentials used by -the VFS, and that can be done by calling into such as vfs_mkdir() with a -different set of credentials. This is done in the following places: - - (*) sys_faccessat(). - - (*) do_coredump(). - - (*) nfs4recover.c. diff --git a/Documentation/filesystems/nfs/idmapper.txt b/Documentation/filesystems/nfs/idmapper.txt index b9b4192ea8b..9c8fd614865 100644 --- a/Documentation/filesystems/nfs/idmapper.txt +++ b/Documentation/filesystems/nfs/idmapper.txt @@ -47,8 +47,8 @@ request-key will find the first matching line and corresponding program. In this case, /some/other/program will handle all uid lookups and /usr/sbin/nfs.idmap will handle gid, user, and group lookups. -See <file:Documentation/keys-request-key.txt> for more information about the -request-key function. +See <file:Documentation/security/keys-request-key.txt> for more information +about the request-key function. ========= diff --git a/Documentation/keys-request-key.txt b/Documentation/keys-request-key.txt deleted file mode 100644 index 69686ad12c6..00000000000 --- a/Documentation/keys-request-key.txt +++ /dev/null @@ -1,202 +0,0 @@ - =================== - KEY REQUEST SERVICE - =================== - -The key request service is part of the key retention service (refer to -Documentation/keys.txt). This document explains more fully how the requesting -algorithm works. - -The process starts either by the kernel requesting a service by calling -request_key*(): - - struct key *request_key(const struct key_type *type, - const char *description, - const char *callout_info); - -or: - - struct key *request_key_with_auxdata(const struct key_type *type, - const char *description, - const char *callout_info, - size_t callout_len, - void *aux); - -or: - - struct key *request_key_async(const struct key_type *type, - const char *description, - const char *callout_info, - size_t callout_len); - -or: - - struct key *request_key_async_with_auxdata(const struct key_type *type, - const char *description, - const char *callout_info, - size_t callout_len, - void *aux); - -Or by userspace invoking the request_key system call: - - key_serial_t request_key(const char *type, - const char *description, - const char *callout_info, - key_serial_t dest_keyring); - -The main difference between the access points is that the in-kernel interface -does not need to link the key to a keyring to prevent it from being immediately -destroyed.
The kernel interface returns a pointer directly to the key, and -it's up to the caller to destroy the key. - -The request_key*_with_auxdata() calls are like the in-kernel request_key*() -calls, except that they permit auxiliary data to be passed to the upcaller (the -default is NULL). This is only useful for those key types that define their -own upcall mechanism rather than using /sbin/request-key. - -The two async in-kernel calls may return keys that are still in the process of -being constructed. The two non-async ones will wait for construction to -complete first. - -The userspace interface links the key to a keyring associated with the process -to prevent the key from going away, and returns the serial number of the key to -the caller. - - -The following example assumes that the key types involved don't define their -own upcall mechanisms. If they do, then those should be substituted for the -forking and execution of /sbin/request-key. - - -=========== -THE PROCESS -=========== - -A request proceeds in the following manner: - - (1) Process A calls request_key() [the userspace syscall calls the kernel - interface]. - - (2) request_key() searches the process's subscribed keyrings to see if there's - a suitable key there. If there is, it returns the key. If there isn't, - and callout_info is not set, an error is returned. Otherwise the process - proceeds to the next step. - - (3) request_key() sees that A doesn't have the desired key yet, so it creates - two things: - - (a) An uninstantiated key U of requested type and description. - - (b) An authorisation key V that refers to key U and notes that process A - is the context in which key U should be instantiated and secured, and - from which associated key requests may be satisfied. - - (4) request_key() then forks and executes /sbin/request-key with a new session - keyring that contains a link to auth key V. - - (5) /sbin/request-key assumes the authority associated with key U. - - (6) /sbin/request-key execs an appropriate program to perform the actual - instantiation. - - (7) The program may want to access another key from A's context (say a - Kerberos TGT key). It just requests the appropriate key, and the keyring - search notes that the session keyring has auth key V in its bottom level. - - This will permit it to then search the keyrings of process A with the - UID, GID, groups and security info of process A as if it was process A, - and come up with key W. - - (8) The program then does what it must to get the data with which to - instantiate key U, using key W as a reference (perhaps it contacts a - Kerberos server using the TGT) and then instantiates key U. - - (9) Upon instantiating key U, auth key V is automatically revoked so that it - may not be used again. - -(10) The program then exits 0 and request_key() deletes key V and returns key - U to the caller. - -This also extends further. If key W (step 7 above) didn't exist, key W would -be created uninstantiated, another auth key (X) would be created (as per step -3) and another copy of /sbin/request-key spawned (as per step 4); but the -context specified by auth key X will still be process A, as it was in auth key -V. - -This is because process A's keyrings can't simply be attached to -/sbin/request-key at the appropriate places because (a) execve will discard two -of them, and (b) it requires the same UID/GID/Groups all the way through. 
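To illustrate the kernel-side entry point to this process, a minimal sketch (the key description is hypothetical) that looks up a user-defined key, possibly triggering the upcall described above:

    #include <linux/err.h>
    #include <linux/key.h>
    #include <keys/user-type.h>

    static int find_my_token(void)
    {
        struct key *k;

        /* searches the caller's keyrings; upcalls /sbin/request-key on a miss */
        k = request_key(&key_type_user, "mydaemon:token", NULL);
        if (IS_ERR(k))
            return PTR_ERR(k);
        /* ... use the key's payload ... */
        key_put(k);
        return 0;
    }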
- - -==================================== -NEGATIVE INSTANTIATION AND REJECTION -==================================== - -Rather than instantiating a key, it is possible for the possessor of an -authorisation key to negatively instantiate a key that's under construction. -This is a short duration placeholder that causes any attempt at re-requesting -the key whilst it exists to fail with error ENOKEY if negated or the specified -error if rejected. - -This is provided to prevent excessive repeated spawning of /sbin/request-key -processes for a key that will never be obtainable. - -Should the /sbin/request-key process exit anything other than 0 or die on a -signal, the key under construction will be automatically negatively -instantiated for a short amount of time. - - -==================== -THE SEARCH ALGORITHM -==================== - -A search of any particular keyring proceeds in the following fashion: - - (1) When the key management code searches for a key (keyring_search_aux) it - firstly calls key_permission(SEARCH) on the keyring it's starting with; - if this denies permission, it doesn't search further. - - (2) It considers all the non-keyring keys within that keyring and, if any key - matches the criteria specified, calls key_permission(SEARCH) on it to see - if the key is allowed to be found. If it is, that key is returned; if - not, the search continues, and the error code is retained if of higher - priority than the one currently set. - - (3) It then considers all the keyring-type keys in the keyring it's currently - searching. It calls key_permission(SEARCH) on each keyring, and if this - grants permission, it recurses, executing steps (2) and (3) on that - keyring. - -The process stops immediately a valid key is found with permission granted to -use it. Any error from a previous match attempt is discarded and the key is -returned. - -When search_process_keyrings() is invoked, it performs the following searches -until one succeeds: - - (1) If extant, the process's thread keyring is searched. - - (2) If extant, the process's process keyring is searched. - - (3) The process's session keyring is searched. - - (4) If the process has assumed the authority associated with a request_key() - authorisation key then: - - (a) If extant, the calling process's thread keyring is searched. - - (b) If extant, the calling process's process keyring is searched. - - (c) The calling process's session keyring is searched. - -The moment one succeeds, all pending errors are discarded and the found key is -returned. - -Only if all these fail does the whole thing fail with the highest priority -error. Note that several errors may have come from LSM. - -The error priority is: - - EKEYREVOKED > EKEYEXPIRED > ENOKEY - -EACCES/EPERM are only returned on a direct search of a specific keyring where -the basal keyring does not grant Search permission. diff --git a/Documentation/keys-trusted-encrypted.txt b/Documentation/keys-trusted-encrypted.txt deleted file mode 100644 index 8fb79bc1ac4..00000000000 --- a/Documentation/keys-trusted-encrypted.txt +++ /dev/null @@ -1,145 +0,0 @@ - Trusted and Encrypted Keys - -Trusted and Encrypted Keys are two new key types added to the existing kernel -key ring service. Both of these new types are variable length symmetric keys, -and in both cases all keys are created in the kernel, and user space sees, -stores, and loads only encrypted blobs. Trusted Keys require the availability
Trusted Keys require the availability -of a Trusted Platform Module (TPM) chip for greater security, while Encrypted -Keys can be used on any system. All user level blobs, are displayed and loaded -in hex ascii for convenience, and are integrity verified. - -Trusted Keys use a TPM both to generate and to seal the keys. Keys are sealed -under a 2048 bit RSA key in the TPM, and optionally sealed to specified PCR -(integrity measurement) values, and only unsealed by the TPM, if PCRs and blob -integrity verifications match. A loaded Trusted Key can be updated with new -(future) PCR values, so keys are easily migrated to new pcr values, such as -when the kernel and initramfs are updated. The same key can have many saved -blobs under different PCR values, so multiple boots are easily supported. - -By default, trusted keys are sealed under the SRK, which has the default -authorization value (20 zeros). This can be set at takeownership time with the -trouser's utility: "tpm_takeownership -u -z". - -Usage: - keyctl add trusted name "new keylen [options]" ring - keyctl add trusted name "load hex_blob [pcrlock=pcrnum]" ring - keyctl update key "update [options]" - keyctl print keyid - - options: - keyhandle= ascii hex value of sealing key default 0x40000000 (SRK) - keyauth= ascii hex auth for sealing key default 0x00...i - (40 ascii zeros) - blobauth= ascii hex auth for sealed data default 0x00... - (40 ascii zeros) - blobauth= ascii hex auth for sealed data default 0x00... - (40 ascii zeros) - pcrinfo= ascii hex of PCR_INFO or PCR_INFO_LONG (no default) - pcrlock= pcr number to be extended to "lock" blob - migratable= 0|1 indicating permission to reseal to new PCR values, - default 1 (resealing allowed) - -"keyctl print" returns an ascii hex copy of the sealed key, which is in standard -TPM_STORED_DATA format. The key length for new keys are always in bytes. -Trusted Keys can be 32 - 128 bytes (256 - 1024 bits), the upper limit is to fit -within the 2048 bit SRK (RSA) keylength, with all necessary structure/padding. - -Encrypted keys do not depend on a TPM, and are faster, as they use AES for -encryption/decryption. New keys are created from kernel generated random -numbers, and are encrypted/decrypted using a specified 'master' key. The -'master' key can either be a trusted-key or user-key type. The main -disadvantage of encrypted keys is that if they are not rooted in a trusted key, -they are only as secure as the user key encrypting them. The master user key -should therefore be loaded in as secure a way as possible, preferably early in -boot. - -Usage: - keyctl add encrypted name "new key-type:master-key-name keylen" ring - keyctl add encrypted name "load hex_blob" ring - keyctl update keyid "update key-type:master-key-name" - -where 'key-type' is either 'trusted' or 'user'. 
-
-Examples of trusted and encrypted key usage:
-
-Create and save a trusted key named "kmk" of length 32 bytes:
-
-    $ keyctl add trusted kmk "new 32" @u
-    440502848
-
-    $ keyctl show
-    Session Keyring
-           -3 --alswrv    500   500  keyring: _ses
-     97833714 --alswrv    500    -1   \_ keyring: _uid.500
-    440502848 --alswrv    500   500       \_ trusted: kmk
-
-    $ keyctl print 440502848
-    0101000000000000000001005d01b7e3f4a6be5709930f3b70a743cbb42e0cc95e18e915
-    3f60da455bbf1144ad12e4f92b452f966929f6105fd29ca28e4d4d5a031d068478bacb0b
-    27351119f822911b0a11ba3d3498ba6a32e50dac7f32894dd890eb9ad578e4e292c83722
-    a52e56a097e6a68b3f56f7a52ece0cdccba1eb62cad7d817f6dc58898b3ac15f36026fec
-    d568bd4a706cb60bb37be6d8f1240661199d640b66fb0fe3b079f97f450b9ef9c22c6d5d
-    dd379f0facd1cd020281dfa3c70ba21a3fa6fc2471dc6d13ecf8298b946f65345faa5ef0
-    f1f8fff03ad0acb083725535636addb08d73dedb9832da198081e5deae84bfaf0409c22b
-    e4a8aea2b607ec96931e6f4d4fe563ba
-
-    $ keyctl pipe 440502848 > kmk.blob
-
-Load a trusted key from the saved blob:
-
-    $ keyctl add trusted kmk "load `cat kmk.blob`" @u
-    268728824
-
-    $ keyctl print 268728824
-    0101000000000000000001005d01b7e3f4a6be5709930f3b70a743cbb42e0cc95e18e915
-    3f60da455bbf1144ad12e4f92b452f966929f6105fd29ca28e4d4d5a031d068478bacb0b
-    27351119f822911b0a11ba3d3498ba6a32e50dac7f32894dd890eb9ad578e4e292c83722
-    a52e56a097e6a68b3f56f7a52ece0cdccba1eb62cad7d817f6dc58898b3ac15f36026fec
-    d568bd4a706cb60bb37be6d8f1240661199d640b66fb0fe3b079f97f450b9ef9c22c6d5d
-    dd379f0facd1cd020281dfa3c70ba21a3fa6fc2471dc6d13ecf8298b946f65345faa5ef0
-    f1f8fff03ad0acb083725535636addb08d73dedb9832da198081e5deae84bfaf0409c22b
-    e4a8aea2b607ec96931e6f4d4fe563ba
-
-Reseal a trusted key under new PCR values:
-
-    $ keyctl update 268728824 "update pcrinfo=`cat pcr.blob`"
-    $ keyctl print 268728824
-    010100000000002c0002800093c35a09b70fff26e7a98ae786c641e678ec6ffb6b46d805
-    77c8a6377aed9d3219c6dfec4b23ffe3000001005d37d472ac8a44023fbb3d18583a4f73
-    d3a076c0858f6f1dcaa39ea0f119911ff03f5406df4f7f27f41da8d7194f45c9f4e00f2e
-    df449f266253aa3f52e55c53de147773e00f0f9aca86c64d94c95382265968c354c5eab4
-    9638c5ae99c89de1e0997242edfb0b501744e11ff9762dfd951cffd93227cc513384e7e6
-    e782c29435c7ec2edafaa2f4c1fe6e7a781b59549ff5296371b42133777dcc5b8b971610
-    94bc67ede19e43ddb9dc2baacad374a36feaf0314d700af0a65c164b7082401740e489c9
-    7ef6a24defe4846104209bf0c3eced7fa1a672ed5b125fc9d8cd88b476a658a4434644ef
-    df8ae9a178e9f83ba9f08d10fa47e4226b98b0702f06b3b8
-
-Create and save an encrypted key "evm" using the above trusted key "kmk":
-
-    $ keyctl add encrypted evm "new trusted:kmk 32" @u
-    159771175
-
-    $ keyctl print 159771175
-    trusted:kmk 32 2375725ad57798846a9bbd240de8906f006e66c03af53b1b382dbbc55
-    be2a44616e4959430436dc4f2a7a9659aa60bb4652aeb2120f149ed197c564e024717c64
-    5972dcb82ab2dde83376d82b2e3c09ffc
-
-    $ keyctl pipe 159771175 > evm.blob
-
-Load an encrypted key "evm" from saved blob:
-
-    $ keyctl add encrypted evm "load `cat evm.blob`" @u
-    831684262
-
-    $ keyctl print 831684262
-    trusted:kmk 32 2375725ad57798846a9bbd240de8906f006e66c03af53b1b382dbbc55
-    be2a44616e4959430436dc4f2a7a9659aa60bb4652aeb2120f149ed197c564e024717c64
-    5972dcb82ab2dde83376d82b2e3c09ffc
-
-
-The initial consumer of trusted keys is EVM, which at boot time needs a high
-quality symmetric key for HMAC protection of file metadata. The use of a
-trusted key provides strong guarantees that the EVM key has not been
-compromised by a user level problem, and when sealed to specific boot PCR
-values, protects against boot and offline attacks.
-Other uses for trusted and encrypted keys, such as disk and file encryption,
-are anticipated.
diff --git a/Documentation/keys.txt b/Documentation/keys.txt
deleted file mode 100644
index 6523a9e6f29..00000000000
--- a/Documentation/keys.txt
+++ /dev/null
@@ -1,1290 +0,0 @@
-                        ============================
-                        KERNEL KEY RETENTION SERVICE
-                        ============================
-
-This service allows cryptographic keys, authentication tokens, cross-domain
-user mappings, and similar to be cached in the kernel for the use of
-filesystems and other kernel services.
-
-Keyrings are permitted; these are a special type of key that can hold links to
-other keys. Processes each have three standard keyring subscriptions that a
-kernel service can search for relevant keys.
-
-The key service can be configured on by enabling:
-
-        "Security options"/"Enable access key retention support" (CONFIG_KEYS)
-
-This document has the following sections:
-
-        - Key overview
-        - Key service overview
-        - Key access permissions
-        - SELinux support
-        - New procfs files
-        - Userspace system call interface
-        - Kernel services
-        - Notes on accessing payload contents
-        - Defining a key type
-        - Request-key callback service
-        - Garbage collection
-
-
-============
-KEY OVERVIEW
-============
-
-In this context, keys represent units of cryptographic data, authentication
-tokens, keyrings, etc. These are represented in the kernel by struct key.
-
-Each key has a number of attributes:
-
-        - A serial number.
-        - A type.
-        - A description (for matching a key in a search).
-        - Access control information.
-        - An expiry time.
-        - A payload.
-        - State.
-
-
- (*) Each key is issued a serial number of type key_serial_t that is unique
-     for the lifetime of that key. All serial numbers are positive non-zero
-     32-bit integers.
-
-     Userspace programs can use a key's serial number as a way to gain access
-     to it, subject to permission checking.
-
- (*) Each key is of a defined "type". Types must be registered inside the
-     kernel by a kernel service (such as a filesystem) before keys of that
-     type can be added or used. Userspace programs cannot define new types
-     directly.
-
-     Key types are represented in the kernel by struct key_type. This defines
-     a number of operations that can be performed on a key of that type.
-
-     Should a type be removed from the system, all the keys of that type will
-     be invalidated.
-
- (*) Each key has a description. This should be a printable string. The key
-     type provides an operation to perform a match between the description on
-     a key and a criterion string.
-
- (*) Each key has an owner user ID, a group ID and a permissions mask. These
-     are used to control what a process may do to a key from userspace, and
-     whether a kernel service will be able to find the key.
-
- (*) Each key can be set to expire at a specific time by the key type's
-     instantiation function. Keys can also be immortal.
-
- (*) Each key can have a payload. This is a quantity of data that represents
-     the actual "key". In the case of a keyring, this is a list of keys to
-     which the keyring links; in the case of a user-defined key, it's an
-     arbitrary blob of data.
-
-     Having a payload is not required; the payload can, in fact, just be a
-     value stored in the struct key itself.
-
-     When a key is instantiated, the key type's instantiation function is
-     called with a blob of data, and that then creates the key's payload in
-     some way.
-
-     Similarly, when userspace wants to read back the contents of the key, if
-     permitted, another key type operation will be called to convert the
-     key's attached payload back into a blob of data.
-
- (*) Each key can be in one of a number of basic states:
-
-     (*) Uninstantiated. The key exists, but does not have any data attached.
-         Keys being requested from userspace will be in this state.
-
-     (*) Instantiated. This is the normal state. The key is fully formed, and
-         has data attached.
-
-     (*) Negative. This is a relatively short-lived state. The key acts as a
-         note saying that a previous call out to userspace failed, and acts
-         as a throttle on key lookups. A negative key can be updated to a
-         normal state.
-
-     (*) Expired. Keys can have lifetimes set. If their lifetime is exceeded,
-         they traverse to this state. An expired key can be updated back to a
-         normal state.
-
-     (*) Revoked. A key is put in this state by userspace action. It can't be
-         found or operated upon (apart from by unlinking it).
-
-     (*) Dead. The key's type was unregistered, and so the key is now useless.
-
-Keys in the last three states are subject to garbage collection. See the
-section on "Garbage collection".
-
-
-====================
-KEY SERVICE OVERVIEW
-====================
-
-The key service provides a number of features besides keys:
-
- (*) The key service defines two special key types:
-
-     (+) "keyring"
-
-         Keyrings are special keys that contain a list of other keys. Keyring
-         lists can be modified using various system calls. Keyrings should
-         not be given a payload when created.
-
-     (+) "user"
-
-         A key of this type has a description and a payload that are
-         arbitrary blobs of data. These can be created, updated and read by
-         userspace, and aren't intended for use by kernel services.
-
- (*) Each process subscribes to three keyrings: a thread-specific keyring, a
-     process-specific keyring, and a session-specific keyring.
-
-     The thread-specific keyring is discarded from the child when any sort of
-     clone, fork, vfork or execve occurs. A new keyring is created only when
-     required.
-
-     The process-specific keyring is replaced with an empty one in the child
-     on clone, fork, vfork unless CLONE_THREAD is supplied, in which case it
-     is shared. execve also discards the process's process keyring and
-     creates a new one.
-
-     The session-specific keyring is persistent across clone, fork, vfork and
-     execve, even when the latter executes a set-UID or set-GID binary. A
-     process can, however, replace its current session keyring with a new one
-     by using PR_JOIN_SESSION_KEYRING. It is permitted to request an
-     anonymous new one, or to attempt to create or join one of a specific
-     name.
-
-     The ownership of the thread keyring changes when the real UID and GID of
-     the thread changes.
-
- (*) Each user ID resident in the system holds two special keyrings: a user
-     specific keyring and a default user session keyring. The default session
-     keyring is initialised with a link to the user-specific keyring.
-
-     When a process changes its real UID, if it used to have no session
-     keyring, it will be subscribed to the default session keyring for the
-     new UID.
-
-     If a process attempts to access its session keyring when it doesn't have
-     one, it will be subscribed to the default for its current UID.
-
- (*) Each user has two quotas against which the keys they own are tracked.
-     One limits the total number of keys and keyrings, the other limits the
-     total amount of description and payload space that can be consumed.
-
-     The user can view information on this and other statistics through
-     procfs files. The root user may also alter the quota limits through
-     sysctl files (see the section "New procfs files").
-
-     Process-specific and thread-specific keyrings are not counted towards a
-     user's quota.
-
-     If a system call that modifies a key or keyring in some way would put
-     the user over quota, the operation is refused and error EDQUOT is
-     returned.
-
- (*) There's a system call interface by which userspace programs can create
-     and manipulate keys and keyrings.
-
- (*) There's a kernel interface by which services can register types and
-     search for keys.
-
- (*) There's a way for a search done from the kernel to call back to
-     userspace to request a key that can't be found in a process's keyrings.
-
- (*) An optional filesystem is available through which the key database can
-     be viewed and manipulated.
-
-
-======================
-KEY ACCESS PERMISSIONS
-======================
-
-Keys have an owner user ID, a group access ID, and a permissions mask. The
-mask has up to eight bits each for possessor, user, group and other access.
-Only six of each set of eight bits are defined. The permissions granted are:
-
- (*) View
-
-     This permits a key or keyring's attributes to be viewed - including key
-     type and description.
-
- (*) Read
-
-     This permits a key's payload to be viewed, or a keyring's list of linked
-     keys to be listed.
-
- (*) Write
-
-     This permits a key's payload to be instantiated or updated, or it allows
-     a link to be added to or removed from a keyring.
-
- (*) Search
-
-     This permits keyrings to be searched and keys to be found. Searches can
-     only recurse into nested keyrings that have search permission set.
-
- (*) Link
-
-     This permits a key or keyring to be linked to. To create a link from a
-     keyring to a key, a process must have Write permission on the keyring
-     and Link permission on the key.
-
- (*) Set Attribute
-
-     This permits a key's UID, GID and permissions mask to be changed.
-
-For changing the ownership, group ID or permissions mask, being the owner of
-the key or having the sysadmin capability is sufficient.
-
-
-===============
-SELINUX SUPPORT
-===============
-
-The security class "key" has been added to SELinux so that mandatory access
-controls can be applied to keys created within various contexts. This support
-is preliminary, and is likely to change quite significantly in the near
-future. Currently, all of the basic permissions explained above are provided
-in SELinux as well; SELinux is simply invoked after all basic permission
-checks have been performed.
-
-The value of the file /proc/self/attr/keycreate influences the labeling of
-newly-created keys. If the contents of that file correspond to an SELinux
-security context, then the key will be assigned that context. Otherwise, the
-key will be assigned the current context of the task that invoked the key
-creation request. Tasks must be granted explicit permission to assign a
-particular context to newly-created keys, using the "create" permission in
-the key security class.
-
-The default keyrings associated with users will be labeled with the default
-context of the user if and only if the login programs have been instrumented
-to properly initialize keycreate during the login process. Otherwise, they
-will be labeled with the context of the login program itself.
-
-Note, however, that the default keyrings associated with the root user are
-labeled with the default kernel context, since they are created early in the
-boot process, before root has a chance to log in.
-
-The keyrings associated with new threads are each labeled with the context of
-their associated thread, and both session and process keyrings are handled
-similarly.
-
-
-================
-NEW PROCFS FILES
-================
-
-Two files have been added to procfs by which an administrator can find out
-about the status of the key service:
-
- (*) /proc/keys
-
-     This lists the keys that are currently viewable by the task reading the
-     file, giving information about their type, description and permissions.
-     It is not possible to view the payload of the key this way, though some
-     information about it may be given.
-
-     The only keys included in the list are those that grant View permission
-     to the reading process whether or not it possesses them. Note that LSM
-     security checks are still performed, and may further filter out keys
-     that the current process is not authorised to view.
-
-     The contents of the file look like this:
-
-        SERIAL   FLAGS  USAGE EXPY PERM     UID   GID   TYPE      DESCRIPTION: SUMMARY
-        00000001 I-----    39 perm 1f3f0000     0     0 keyring   _uid_ses.0: 1/4
-        00000002 I-----     2 perm 1f3f0000     0     0 keyring   _uid.0: empty
-        00000007 I-----     1 perm 1f3f0000     0     0 keyring   _pid.1: empty
-        0000018d I-----     1 perm 1f3f0000     0     0 keyring   _pid.412: empty
-        000004d2 I--Q--     1 perm 1f3f0000    32    -1 keyring   _uid.32: 1/4
-        000004d3 I--Q--     3 perm 1f3f0000    32    -1 keyring   _uid_ses.32: empty
-        00000892 I--QU-     1 perm 1f000000     0     0 user      metal:copper: 0
-        00000893 I--Q-N     1  35s 1f3f0000     0     0 user      metal:silver: 0
-        00000894 I--Q--     1  10h 003f0000     0     0 user      metal:gold: 0
-
-     The flags are:
-
-        I       Instantiated
-        R       Revoked
-        D       Dead
-        Q       Contributes to user's quota
-        U       Under construction by callback to userspace
-        N       Negative key
-
-     This file must be enabled at kernel configuration time as it allows
-     anyone to list the keys database.
-
- (*) /proc/key-users
-
-     This file lists the tracking data for each user that has at least one
-     key on the system. Such data includes quota information and statistics:
-
-        [root@andromeda root]# cat /proc/key-users
-        0:     46 45/45 1/100 13/10000
-        29:     2 2/2 2/100 40/10000
-        32:     2 2/2 2/100 40/10000
-        38:     2 2/2 2/100 40/10000
-
-     The format of each line is:
-
-        <UID>:                  User ID to which this applies
-        <usage>                 Structure refcount
-        <inst>/<keys>           Total number of keys and number instantiated
-        <keys>/<max>            Key count quota
-        <bytes>/<max>           Key size quota
-
-
-Four new sysctl files have also been added for the purpose of controlling the
-quota limits on keys:
-
- (*) /proc/sys/kernel/keys/root_maxkeys
-     /proc/sys/kernel/keys/root_maxbytes
-
-     These files hold the maximum number of keys that root may have and the
-     maximum total number of bytes of data that root may have stored in those
-     keys.
-
- (*) /proc/sys/kernel/keys/maxkeys
-     /proc/sys/kernel/keys/maxbytes
-
-     These files hold the maximum number of keys that each non-root user may
-     have and the maximum total number of bytes of data that each of those
-     users may have stored in their keys.
-
-Root may alter these by writing each new limit as a decimal number string to
-the appropriate file.
-
-
-===============================
-USERSPACE SYSTEM CALL INTERFACE
-===============================
-
-Userspace can manipulate keys directly through three new syscalls: add_key,
-request_key and keyctl. The latter provides a number of functions for
-manipulating keys.
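[Editorial sketch] Before the individual calls are described in detail, the
following minimal example creates a "user" key, reads it back and revokes it.
It assumes the <keyutils.h> wrappers and constants from libkeyutils; the
description "example:token" and its payload are invented for the example, and
error handling is trimmed:

        #include <stdio.h>
        #include <keyutils.h>

        int main(void)
        {
                key_serial_t key;
                char buf[64];
                long len;

                /* create (or update) a "user" key in the session keyring */
                key = add_key("user", "example:token", "secret", 6,
                              KEY_SPEC_SESSION_KEYRING);
                if (key == -1)
                        return 1;

                /* read the payload back; the return value is the payload size */
                len = keyctl(KEYCTL_READ, key, buf, sizeof(buf));
                if (len >= 0)
                        printf("read %ld bytes of payload\n", len);

                /* make the key unavailable for further operations */
                keyctl(KEYCTL_REVOKE, key);
                return 0;
        }

Build with -lkeyutils to get the add_key() and keyctl() wrappers; each
operation used here is documented below.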
-
-When referring to a key directly, userspace programs should use the key's
-serial number (a positive 32-bit integer). However, there are some special
-values available for referring to special keys and keyrings that relate to
-the process making the call:
-
-        CONSTANT                        VALUE   KEY REFERENCED
-        ==============================  ======  ===========================
-        KEY_SPEC_THREAD_KEYRING         -1      thread-specific keyring
-        KEY_SPEC_PROCESS_KEYRING        -2      process-specific keyring
-        KEY_SPEC_SESSION_KEYRING        -3      session-specific keyring
-        KEY_SPEC_USER_KEYRING           -4      UID-specific keyring
-        KEY_SPEC_USER_SESSION_KEYRING   -5      UID-session keyring
-        KEY_SPEC_GROUP_KEYRING          -6      GID-specific keyring
-        KEY_SPEC_REQKEY_AUTH_KEY        -7      assumed request_key()
-                                                  authorisation key
-
-
-The main syscalls are:
-
- (*) Create a new key of given type, description and payload and add it to
-     the nominated keyring:
-
-        key_serial_t add_key(const char *type, const char *desc,
-                             const void *payload, size_t plen,
-                             key_serial_t keyring);
-
-     If a key of the same type and description as that proposed already
-     exists in the keyring, this will try to update it with the given
-     payload, or it will return error EEXIST if that function is not
-     supported by the key type. The process must also have permission to
-     write to the key to be able to update it. The new key will have all
-     user permissions granted and no group or third party permissions.
-
-     Otherwise, this will attempt to create a new key of the specified type
-     and description, and to instantiate it with the supplied payload and
-     attach it to the keyring. In this case, an error will be generated if
-     the process does not have permission to write to the keyring.
-
-     The payload is optional, and the pointer can be NULL if not required by
-     the type. The payload is plen in size, and plen can be zero for an
-     empty payload.
-
-     A new keyring can be generated by setting type "keyring", the keyring
-     name as the description (or NULL) and setting the payload to NULL.
-
-     User defined keys can be created by specifying type "user". It is
-     recommended that a user defined key's description be prefixed with a
-     type ID and a colon, such as "krb5tgt:" for a Kerberos 5 ticket granting
-     ticket.
-
-     Any other type must have been registered with the kernel in advance by
-     a kernel service such as a filesystem.
-
-     The ID of the new or updated key is returned if successful.
-
-
- (*) Search the process's keyrings for a key, potentially calling out to
-     userspace to create it.
-
-        key_serial_t request_key(const char *type, const char *description,
-                                 const char *callout_info,
-                                 key_serial_t dest_keyring);
-
-     This function searches all the process's keyrings in the order thread,
-     process, session for a matching key. This works very much like
-     KEYCTL_SEARCH, including the optional attachment of the discovered key
-     to a keyring.
-
-     If a key cannot be found, and if callout_info is not NULL, then
-     /sbin/request-key will be invoked in an attempt to obtain a key. The
-     callout_info string will be passed as an argument to the program.
-
-     See also Documentation/keys-request-key.txt.
-
-
-The keyctl syscall functions are:
-
- (*) Map a special key ID to a real key ID for this process:
-
-        key_serial_t keyctl(KEYCTL_GET_KEYRING_ID, key_serial_t id,
-                            int create);
-
-     The special key specified by "id" is looked up (with the key being
-     created if necessary) and the ID of the key or keyring thus found is
-     returned if it exists.
-
-     If the key does not yet exist, the key will be created if "create" is
-     non-zero, and the error ENOKEY will be returned if "create" is zero.
-
-
- (*) Replace the session keyring this process subscribes to with a new one:
-
-        key_serial_t keyctl(KEYCTL_JOIN_SESSION_KEYRING, const char *name);
-
-     If name is NULL, an anonymous keyring is created attached to the process
-     as its session keyring, displacing the old session keyring.
-
-     If name is not NULL, if a keyring of that name exists, the process
-     attempts to attach it as the session keyring, returning an error if that
-     is not permitted; otherwise a new keyring of that name is created and
-     attached as the session keyring.
-
-     To attach to a named keyring, the keyring must have search permission
-     for the process's ownership.
-
-     The ID of the new session keyring is returned if successful.
-
-
- (*) Update the specified key:
-
-        long keyctl(KEYCTL_UPDATE, key_serial_t key, const void *payload,
-                    size_t plen);
-
-     This will try to update the specified key with the given payload, or it
-     will return error EOPNOTSUPP if that function is not supported by the
-     key type. The process must also have permission to write to the key to
-     be able to update it.
-
-     The payload is of length plen, and may be absent or empty as for
-     add_key().
-
-
- (*) Revoke a key:
-
-        long keyctl(KEYCTL_REVOKE, key_serial_t key);
-
-     This makes a key unavailable for further operations. Further attempts
-     to use the key will be met with error EKEYREVOKED, and the key will no
-     longer be findable.
-
-
- (*) Change the ownership of a key:
-
-        long keyctl(KEYCTL_CHOWN, key_serial_t key, uid_t uid, gid_t gid);
-
-     This function permits a key's owner and group ID to be changed. Either
-     one of uid or gid can be set to -1 to suppress that change.
-
-     Only the superuser can change a key's owner to something other than the
-     key's current owner. Similarly, only the superuser can change a key's
-     group ID to something other than the calling process's group ID or one
-     of its group list members.
-
-
- (*) Change the permissions mask on a key:
-
-        long keyctl(KEYCTL_SETPERM, key_serial_t key, key_perm_t perm);
-
-     This function permits the owner of a key or the superuser to change the
-     permissions mask on a key.
-
-     Only the available bits are permitted; if any other bits are set, error
-     EINVAL will be returned.
-
-
- (*) Describe a key:
-
-        long keyctl(KEYCTL_DESCRIBE, key_serial_t key, char *buffer,
-                    size_t buflen);
-
-     This function returns a summary of the key's attributes (but not its
-     payload data) as a string in the buffer provided.
-
-     Unless there's an error, it always returns the amount of data it could
-     produce, even if that's too big for the buffer, but it won't copy more
-     than requested to userspace. If the buffer pointer is NULL then no copy
-     will take place.
-
-     A process must have view permission on the key for this function to be
-     successful.
-
-     If successful, a string is placed in the buffer in the following format:
-
-        <type>;<uid>;<gid>;<perm>;<description>
-
-     Where type and description are strings, uid and gid are decimal, and
-     perm is hexadecimal. A NUL character is included at the end of the
-     string if the buffer is sufficiently big.
-
-     This can be parsed with:
-
-        sscanf(buffer, "%[^;];%d;%d;%o;%s", type, &uid, &gid, &mode, desc);
-
-
- (*) Clear out a keyring:
-
-        long keyctl(KEYCTL_CLEAR, key_serial_t keyring);
-
-     This function clears the list of keys attached to a keyring.
-     The calling process must have write permission on the keyring, and it
-     must be a keyring (or else error ENOTDIR will result).
-
-
- (*) Link a key into a keyring:
-
-        long keyctl(KEYCTL_LINK, key_serial_t keyring, key_serial_t key);
-
-     This function creates a link from the keyring to the key. The process
-     must have write permission on the keyring and must have link permission
-     on the key.
-
-     Should the keyring not be a keyring, error ENOTDIR will result; and if
-     the keyring is full, error ENFILE will result.
-
-     The link procedure checks the nesting of the keyrings, returning ELOOP
-     if it appears too deep or EDEADLK if the link would introduce a cycle.
-
-     Any links within the keyring to keys that match the new key in terms of
-     type and description will be discarded from the keyring as the new one
-     is added.
-
-
- (*) Unlink a key or keyring from another keyring:
-
-        long keyctl(KEYCTL_UNLINK, key_serial_t keyring, key_serial_t key);
-
-     This function looks through the keyring for the first link to the
-     specified key, and removes it if found. Subsequent links to that key
-     are ignored. The process must have write permission on the keyring.
-
-     If the keyring is not a keyring, error ENOTDIR will result; and if the
-     key is not present, error ENOENT will be the result.
-
-
- (*) Search a keyring tree for a key:
-
-        key_serial_t keyctl(KEYCTL_SEARCH, key_serial_t keyring,
-                            const char *type, const char *description,
-                            key_serial_t dest_keyring);
-
-     This searches the keyring tree headed by the specified keyring until a
-     key is found that matches the type and description criteria. Each
-     keyring is checked for keys before recursion into its children occurs.
-
-     The process must have search permission on the top level keyring, or
-     else error EACCES will result. Only keyrings that the process has
-     search permission on will be recursed into, and only keys and keyrings
-     for which a process has search permission can be matched. If the
-     specified keyring is not a keyring, ENOTDIR will result.
-
-     If the search succeeds, the function will attempt to link the found key
-     into the destination keyring if one is supplied (non-zero ID). All the
-     constraints applicable to KEYCTL_LINK apply in this case too.
-
-     Error ENOKEY, EKEYREVOKED or EKEYEXPIRED will be returned if the search
-     fails. On success, the resulting key ID will be returned.
-
-
- (*) Read the payload data from a key:
-
-        long keyctl(KEYCTL_READ, key_serial_t keyring, char *buffer,
-                    size_t buflen);
-
-     This function attempts to read the payload data from the specified key
-     into the buffer. The process must have read permission on the key to
-     succeed.
-
-     The returned data will be processed for presentation by the key type.
-     For instance, a keyring will return an array of key_serial_t entries
-     representing the IDs of all the keys to which it is subscribed. The
-     user defined key type will return its data as is. If a key type does
-     not implement this function, error EOPNOTSUPP will result.
-
-     As much of the data as can be fitted into the buffer will be copied to
-     userspace if the buffer pointer is not NULL.
-
-     On a successful return, the function will always return the amount of
-     data available rather than the amount copied.
-
-
- (*) Instantiate a partially constructed key.
-
-        long keyctl(KEYCTL_INSTANTIATE, key_serial_t key,
-                    const void *payload, size_t plen,
-                    key_serial_t keyring);
-        long keyctl(KEYCTL_INSTANTIATE_IOV, key_serial_t key,
-                    const struct iovec *payload_iov, unsigned ioc,
-                    key_serial_t keyring);
-
-     If the kernel calls back to userspace to complete the instantiation of
-     a key, userspace should use this call to supply data for the key before
-     the invoked process returns, or else the key will be marked negative
-     automatically.
-
-     The process must have write access on the key to be able to instantiate
-     it, and the key must be uninstantiated.
-
-     If a keyring is specified (non-zero), the key will also be linked into
-     that keyring; however, all the constraints applicable to KEYCTL_LINK
-     apply in this case too.
-
-     The payload and plen arguments describe the payload data as for
-     add_key().
-
-     The payload_iov and ioc arguments describe the payload data in an iovec
-     array instead of a single buffer.
-
-
- (*) Negatively instantiate a partially constructed key.
-
-        long keyctl(KEYCTL_NEGATE, key_serial_t key,
-                    unsigned timeout, key_serial_t keyring);
-        long keyctl(KEYCTL_REJECT, key_serial_t key,
-                    unsigned timeout, unsigned error, key_serial_t keyring);
-
-     If the kernel calls back to userspace to complete the instantiation of
-     a key, userspace should use this call to mark the key as negative
-     before the invoked process returns if it is unable to fulfil the
-     request.
-
-     The process must have write access on the key to be able to instantiate
-     it, and the key must be uninstantiated.
-
-     If a keyring is specified (non-zero), the key will also be linked into
-     that keyring; however, all the constraints applicable to KEYCTL_LINK
-     apply in this case too.
-
-     If the key is rejected, future searches for it will return the
-     specified error code until the rejected key expires. Negating the key
-     is the same as rejecting the key with ENOKEY as the error code.
-
-
- (*) Set the default request-key destination keyring.
-
-        long keyctl(KEYCTL_SET_REQKEY_KEYRING, int reqkey_defl);
-
-     This sets the default keyring to which implicitly requested keys will
-     be attached for this thread. reqkey_defl should be one of these
-     constants:
-
-        CONSTANT                                VALUE   NEW DEFAULT KEYRING
-        ======================================  ======  =======================
-        KEY_REQKEY_DEFL_NO_CHANGE               -1      No change
-        KEY_REQKEY_DEFL_DEFAULT                 0       Default[1]
-        KEY_REQKEY_DEFL_THREAD_KEYRING          1       Thread keyring
-        KEY_REQKEY_DEFL_PROCESS_KEYRING         2       Process keyring
-        KEY_REQKEY_DEFL_SESSION_KEYRING         3       Session keyring
-        KEY_REQKEY_DEFL_USER_KEYRING            4       User keyring
-        KEY_REQKEY_DEFL_USER_SESSION_KEYRING    5       User session keyring
-        KEY_REQKEY_DEFL_GROUP_KEYRING           6       Group keyring
-
-     The old default will be returned if successful and error EINVAL will be
-     returned if reqkey_defl is not one of the above values.
-
-     The default keyring can be overridden by the keyring indicated to the
-     request_key() system call.
-
-     Note that this setting is inherited across fork/exec.
-
-     [1] The default is: the thread keyring if there is one, otherwise the
-         process keyring if there is one, otherwise the session keyring if
-         there is one, otherwise the user default session keyring.
-
-
- (*) Set the timeout on a key.
-
-        long keyctl(KEYCTL_SET_TIMEOUT, key_serial_t key, unsigned timeout);
-
-     This sets or clears the timeout on a key. The timeout can be 0 to clear
-     the timeout or a number of seconds to set the expiry time that far into
-     the future.
-
-     The process must have attribute modification access on a key to set its
-     timeout. Timeouts may not be set with this function on negative, revoked
-     or expired keys.
-
-
- (*) Assume the authority granted to instantiate a key
-
-        long keyctl(KEYCTL_ASSUME_AUTHORITY, key_serial_t key);
-
-     This assumes or divests the authority required to instantiate the
-     specified key. Authority can only be assumed if the thread has the
-     authorisation key associated with the specified key in its keyrings
-     somewhere.
-
-     Once authority is assumed, searches for keys will also search the
-     requester's keyrings using the requester's security label, UID, GID and
-     groups.
-
-     If the requested authority is unavailable, error EPERM will be
-     returned, likewise if the authority has been revoked because the target
-     key is already instantiated.
-
-     If the specified key is 0, then any assumed authority will be divested.
-
-     The assumed authoritative key is inherited across fork and exec.
-
-
- (*) Get the LSM security context attached to a key.
-
-        long keyctl(KEYCTL_GET_SECURITY, key_serial_t key, char *buffer,
-                    size_t buflen);
-
-     This function returns a string that represents the LSM security context
-     attached to a key in the buffer provided.
-
-     Unless there's an error, it always returns the amount of data it could
-     produce, even if that's too big for the buffer, but it won't copy more
-     than requested to userspace. If the buffer pointer is NULL then no copy
-     will take place.
-
-     A NUL character is included at the end of the string if the buffer is
-     sufficiently big. This is included in the returned count. If no LSM is
-     in force then an empty string will be returned.
-
-     A process must have view permission on the key for this function to be
-     successful.
-
-
- (*) Install the calling process's session keyring on its parent.
-
-        long keyctl(KEYCTL_SESSION_TO_PARENT);
-
-     This function attempts to install the calling process's session keyring
-     on to the calling process's parent, replacing the parent's current
-     session keyring.
-
-     The calling process must have the same ownership as its parent, the
-     keyring must have the same ownership as the calling process, the
-     calling process must have LINK permission on the keyring and the active
-     LSM module mustn't deny permission, otherwise error EPERM will be
-     returned.
-
-     Error ENOMEM will be returned if there was insufficient memory to
-     complete the operation, otherwise 0 will be returned to indicate
-     success.
-
-     The keyring will be replaced next time the parent process leaves the
-     kernel and resumes executing userspace.
-
-
-===============
-KERNEL SERVICES
-===============
-
-The kernel services for key management are fairly simple to deal with. They
-can be broken down into two areas: keys and key types.
-
-Dealing with keys is fairly straightforward. Firstly, the kernel service
-registers its type, then it searches for a key of that type. It should
-retain the key as long as it has need of it, and then it should release it.
-For a filesystem or device file, a search would probably be performed during
-the open call, and the key released upon close. How to deal with conflicting
-keys due to two different users opening the same file is left to the
-filesystem author to solve.
-
-To access the key manager, the following header must be #included:
-
-        <linux/key.h>
-
-Specific key types should have a header file under include/keys/ that should
-be used to access that type.
-For keys of type "user", for example, that would be:
-
-        <keys/user-type.h>
-
-Note that there are two different types of pointers to keys that may be
-encountered:
-
- (*) struct key *
-
-     This simply points to the key structure itself. Key structures will be
-     at least four-byte aligned.
-
- (*) key_ref_t
-
-     This is equivalent to a struct key *, but the least significant bit is
-     set if the caller "possesses" the key. By "possession" it is meant that
-     the calling process has a searchable link to the key from one of its
-     keyrings. There are three functions for dealing with these:
-
-        key_ref_t make_key_ref(const struct key *key,
-                               unsigned long possession);
-
-        struct key *key_ref_to_ptr(const key_ref_t key_ref);
-
-        unsigned long is_key_possessed(const key_ref_t key_ref);
-
-     The first function constructs a key reference from a key pointer and
-     possession information (which must be 0 or 1 and not any other value).
-
-     The second function retrieves the key pointer from a reference and the
-     third retrieves the possession flag.
-
-When accessing a key's payload contents, certain precautions must be taken to
-prevent access vs modification races. See the section "Notes on accessing
-payload contents" for more information.
-
-(*) To search for a key, call:
-
-        struct key *request_key(const struct key_type *type,
-                                const char *description,
-                                const char *callout_info);
-
-    This is used to request a key or keyring with a description that matches
-    the description specified according to the key type's match function.
-    This permits approximate matching to occur. If callout_info is not NULL,
-    then /sbin/request-key will be invoked in an attempt to obtain the key
-    from userspace. In that case, callout_info will be passed as an argument
-    to the program.
-
-    Should the function fail, error ENOKEY, EKEYEXPIRED or EKEYREVOKED will
-    be returned.
-
-    If successful, the key will have been attached to the default keyring
-    for implicitly obtained request-key keys, as set by
-    KEYCTL_SET_REQKEY_KEYRING.
-
-    See also Documentation/keys-request-key.txt.
-
-
-(*) To search for a key, passing auxiliary data to the upcaller, call:
-
-        struct key *request_key_with_auxdata(const struct key_type *type,
-                                             const char *description,
-                                             const void *callout_info,
-                                             size_t callout_len,
-                                             void *aux);
-
-    This is identical to request_key(), except that the auxiliary data is
-    passed to the key_type->request_key() op if it exists, and the
-    callout_info is a blob of length callout_len, if given (the length may
-    be 0).
-
-
-(*) A key can be requested asynchronously by calling one of:
-
-        struct key *request_key_async(const struct key_type *type,
-                                      const char *description,
-                                      const void *callout_info,
-                                      size_t callout_len);
-
-    or:
-
-        struct key *request_key_async_with_auxdata(const struct key_type *type,
-                                                   const char *description,
-                                                   const char *callout_info,
-                                                   size_t callout_len,
-                                                   void *aux);
-
-    which are asynchronous equivalents of request_key() and
-    request_key_with_auxdata() respectively.
-
-    These two functions return with the key potentially still under
-    construction. To wait for construction completion, the following should
-    be called:
-
-        int wait_for_key_construction(struct key *key, bool intr);
-
-    The function will wait for the key to finish being constructed and then
-    invokes key_validate() to return an appropriate value to indicate the
-    state of the key (0 indicates the key is usable).
-
-    If intr is true, then the wait can be interrupted by a signal, in which
-    case error ERESTARTSYS will be returned.
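[Editorial sketch] Putting the above together, a kernel service would
typically request a key, use it, and then drop the reference with key_put()
(described next). A minimal sketch follows; the description "example:token"
is invented for the example, and passing NULL as callout_info confines the
search to keys that already exist:

        #include <linux/key.h>
        #include <linux/err.h>
        #include <keys/user-type.h>

        static int example_get_token(void)
        {
                struct key *key;

                /* search the caller's keyrings; no upcall is made because
                 * callout_info is NULL */
                key = request_key(&key_type_user, "example:token", NULL);
                if (IS_ERR(key))
                        return PTR_ERR(key);

                /* ... use the key's payload here, observing the precautions
                 * in "Notes on accessing payload contents" ... */

                key_put(key);           /* drop the reference when done */
                return 0;
        }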
-
-
-(*) When it is no longer required, the key should be released using:
-
-        void key_put(struct key *key);
-
-    Or:
-
-        void key_ref_put(key_ref_t key_ref);
-
-    These can be called from interrupt context. If CONFIG_KEYS is not set
-    then the argument will not be parsed.
-
-
-(*) Extra references can be made to a key by calling the following function:
-
-        struct key *key_get(struct key *key);
-
-    These need to be disposed of by calling key_put() when they've been
-    finished with. The key pointer passed in will be returned. If the
-    pointer is NULL or CONFIG_KEYS is not set then the key will not be
-    dereferenced and no increment will take place.
-
-
-(*) A key's serial number can be obtained by calling:
-
-        key_serial_t key_serial(struct key *key);
-
-    If key is NULL or if CONFIG_KEYS is not set then 0 will be returned (in
-    the latter case without parsing the argument).
-
-
-(*) If a keyring was found in the search, this can be further searched by:
-
-        key_ref_t keyring_search(key_ref_t keyring_ref,
-                                 const struct key_type *type,
-                                 const char *description);
-
-    This searches the keyring tree specified for a matching key. Error
-    ENOKEY is returned upon failure (use IS_ERR/PTR_ERR to determine). If
-    successful, the returned key will need to be released.
-
-    The possession attribute from the keyring reference is used to control
-    access through the permissions mask and is propagated to the returned
-    key reference pointer if successful.
-
-
-(*) To check the validity of a key, this function can be called:
-
-        int key_validate(struct key *key);
-
-    This checks that the key in question hasn't expired and hasn't been
-    revoked. Should the key be invalid, error EKEYEXPIRED or EKEYREVOKED
-    will be returned. If the key is NULL or if CONFIG_KEYS is not set then
-    0 will be returned (in the latter case without parsing the argument).
-
-
-(*) To register a key type, the following function should be called:
-
-        int register_key_type(struct key_type *type);
-
-    This will return error EEXIST if a type of the same name is already
-    present.
-
-
-(*) To unregister a key type, call:
-
-        void unregister_key_type(struct key_type *type);
-
-
-Under some circumstances, it may be desirable to deal with a bundle of keys.
-The facility provides access to the keyring type for managing such a bundle:
-
-        struct key_type key_type_keyring;
-
-This can be used with a function such as request_key() to find a specific
-keyring in a process's keyrings. A keyring thus found can then be searched
-with keyring_search(). Note that it is not possible to use request_key() to
-search a specific keyring, so using keyrings in this way is of limited
-utility.
-
-
-===================================
-NOTES ON ACCESSING PAYLOAD CONTENTS
-===================================
-
-The simplest payload is just a number in key->payload.value. In this case,
-there's no need to indulge in RCU or locking when accessing the payload.
-
-More complex payload contents must be allocated and a pointer to them set in
-key->payload.data. One of the following ways must be selected to access the
-data:
-
- (1) Unmodifiable key type.
-
-     If the key type does not have a modify method, then the key's payload
-     can be accessed without any form of locking, provided that it's known
-     to be instantiated (uninstantiated keys cannot be "found").
-
- (2) The key's semaphore.
-
-     The semaphore could be used to govern access to the payload and to
-     control the payload pointer.
-     It must be write-locked for modifications and would have to be
-     read-locked for general access. The disadvantage of doing this is that
-     the accessor may be required to sleep.
-
- (3) RCU.
-
-     RCU must be used when the semaphore isn't already held; if the
-     semaphore is held then the contents can't change under you unexpectedly
-     as the semaphore must still be used to serialise modifications to the
-     key. The key management code takes care of this for the key type.
-
-     However, this means using:
-
-        rcu_read_lock() ... rcu_dereference() ... rcu_read_unlock()
-
-     to read the pointer, and:
-
-        rcu_dereference() ... rcu_assign_pointer() ... call_rcu()
-
-     to set the pointer and dispose of the old contents after a grace
-     period. Note that only the key type should ever modify a key's payload.
-
-     Furthermore, an RCU controlled payload must hold a struct rcu_head for
-     the use of call_rcu() and, if the payload is of variable size, the
-     length of the payload. key->datalen cannot be relied upon to be
-     consistent with the payload just dereferenced if the key's semaphore is
-     not held.
-
-
-===================
-DEFINING A KEY TYPE
-===================
-
-A kernel service may want to define its own key type. For instance, an AFS
-filesystem might want to define a Kerberos 5 ticket key type. To do this,
-its author fills in a key_type struct and registers it with the system.
-
-Source files that implement key types should include the following header
-file:
-
-        <linux/key-type.h>
-
-The structure has a number of fields, some of which are mandatory:
-
- (*) const char *name
-
-     The name of the key type. This is used to translate a key type name
-     supplied by userspace into a pointer to the structure.
-
-
- (*) size_t def_datalen
-
-     This is optional - it supplies the default payload data length as
-     contributed to the quota. If the key type's payload is always or almost
-     always the same size, then this is a more efficient way to do things.
-
-     The data length (and quota) on a particular key can always be changed
-     during instantiation or update by calling:
-
-        int key_payload_reserve(struct key *key, size_t datalen);
-
-     with the revised data length. Error EDQUOT will be returned if this is
-     not viable.
-
-
- (*) int (*vet_description)(const char *description);
-
-     This optional method is called to vet a key description. If the key
-     type doesn't approve of the key description, it may return an error,
-     otherwise it should return 0.
-
-
- (*) int (*instantiate)(struct key *key, const void *data, size_t datalen);
-
-     This method is called to attach a payload to a key during construction.
-     The payload attached need not bear any relation to the data passed to
-     this function.
-
-     If the amount of data attached to the key differs from the size in
-     keytype->def_datalen, then key_payload_reserve() should be called.
-
-     This method does not have to lock the key in order to attach a payload.
-     The fact that KEY_FLAG_INSTANTIATED is not set in key->flags prevents
-     anything else from gaining access to the key.
-
-     It is safe to sleep in this method.
-
-
- (*) int (*update)(struct key *key, const void *data, size_t datalen);
-
-     If this type of key can be updated, then this method should be
-     provided. It is called to update a key's payload from the blob of data
-     provided.
-
-     key_payload_reserve() should be called if the data length might change
-     before any changes are actually made.
-     Note that if this succeeds, the type is committed to changing the key
-     because it's already been altered, so all memory allocation must be
-     done first.
-
-     The key will have its semaphore write-locked before this method is
-     called, but this only deters other writers; any changes to the key's
-     payload must be made under RCU conditions, and call_rcu() must be used
-     to dispose of the old payload.
-
-     key_payload_reserve() should be called before the changes are made, but
-     after all allocations and other potentially failing function calls are
-     made.
-
-     It is safe to sleep in this method.
-
-
- (*) int (*match)(const struct key *key, const void *desc);
-
-     This method is called to match a key against a description. It should
-     return non-zero if the two match, zero if they don't.
-
-     This method should not need to lock the key in any way. The type and
-     description can be considered invariant, and the payload should not be
-     accessed (the key may not yet be instantiated).
-
-     It is not safe to sleep in this method; the caller may hold spinlocks.
-
-
- (*) void (*revoke)(struct key *key);
-
-     This method is optional. It is called to discard part of the payload
-     data upon a key being revoked. The caller will have the key semaphore
-     write-locked.
-
-     It is safe to sleep in this method, though care should be taken to
-     avoid a deadlock against the key semaphore.
-
-
- (*) void (*destroy)(struct key *key);
-
-     This method is optional. It is called to discard the payload data on a
-     key when it is being destroyed.
-
-     This method does not need to lock the key to access the payload; it
-     can consider the key as being inaccessible at this time. Note that the
-     key's type may have been changed before this function is called.
-
-     It is not safe to sleep in this method; the caller may hold spinlocks.
-
-
- (*) void (*describe)(const struct key *key, struct seq_file *p);
-
-     This method is optional. It is called during /proc/keys reading to
-     summarise a key's description and payload in text form.
-
-     This method will be called with the RCU read lock held.
-     rcu_dereference() should be used to read the payload pointer if the
-     payload is to be accessed. key->datalen cannot be trusted to stay
-     consistent with the contents of the payload.
-
-     The description will not change, though the key's state may.
-
-     It is not safe to sleep in this method; the RCU read lock is held by
-     the caller.
-
-
- (*) long (*read)(const struct key *key, char __user *buffer, size_t buflen);
-
-     This method is optional. It is called by KEYCTL_READ to translate the
-     key's payload into a blob of data for userspace to deal with. Ideally,
-     the blob should be in the same format as that passed in to the
-     instantiate and update methods.
-
-     If successful, the blob size that could be produced should be returned
-     rather than the size copied.
-
-     This method will be called with the key's semaphore read-locked. This
-     will prevent the key's payload changing. It is not necessary to use
-     RCU locking when accessing the key's payload. It is safe to sleep in
-     this method, such as might happen when the userspace buffer is
-     accessed.
-
-
- (*) int (*request_key)(struct key_construction *cons, const char *op,
-                        void *aux);
-
-     This method is optional. If provided, request_key() and friends will
-     invoke this function rather than upcalling to /sbin/request-key to
-     operate upon a key of this type.
-
-     The aux parameter is as passed to request_key_async_with_auxdata() and
-     similar or is NULL otherwise.
-     Also passed are the construction record for the key to be operated
-     upon and the operation type (currently only "create").
-
-     This method is permitted to return before the upcall is complete, but
-     the following function must be called under all circumstances to
-     complete the instantiation process, whether or not it succeeds, whether
-     or not there's an error:
-
-        void complete_request_key(struct key_construction *cons, int error);
-
-     The error parameter should be 0 on success, -ve on error. The
-     construction record is destroyed by this action and the authorisation
-     key will be revoked. If an error is indicated, the key under
-     construction will be negatively instantiated if it wasn't already
-     instantiated.
-
-     If this method returns an error, that error will be returned to the
-     caller of request_key*(). complete_request_key() must be called prior
-     to returning.
-
-     The key under construction and the authorisation key can be found in
-     the key_construction struct pointed to by cons:
-
-     (*) struct key *key;
-
-         The key under construction.
-
-     (*) struct key *authkey;
-
-         The authorisation key.
-
-
-============================
-REQUEST-KEY CALLBACK SERVICE
-============================
-
-To create a new key, the kernel will attempt to execute the following
-command line:
-
-        /sbin/request-key create <key> <uid> <gid> \
-                <threadring> <processring> <sessionring>
-
-<key> is the key being constructed, and the three keyrings are the process
-keyrings from the process that caused the search to be issued. These are
-included for two reasons:
-
- (1) There may be an authentication token in one of the keyrings that is
-     required to obtain the key, eg: a Kerberos Ticket-Granting Ticket.
-
- (2) The new key should probably be cached in one of these rings.
-
-This program should set its UID and GID to those specified before attempting
-to access any more keys. It may then look around for a user specific process
-to hand the request off to (perhaps a path held in another key by, for
-example, the KDE desktop manager).
-
-The program (or whatever it calls) should finish construction of the key by
-calling KEYCTL_INSTANTIATE or KEYCTL_INSTANTIATE_IOV, which also permits it
-to cache the key in one of the keyrings (probably the session ring) before
-returning. Alternatively, the key can be marked as negative with
-KEYCTL_NEGATE or KEYCTL_REJECT; this also permits the key to be cached in
-one of the keyrings.
-
-If it returns with the key remaining in the unconstructed state, the key
-will be marked as being negative, it will be added to the session keyring,
-and an error will be returned to the key requestor.
-
-Supplementary information may be provided from whoever or whatever invoked
-this service. This will be passed as the <callout_info> parameter. If no
-such information was made available, then "-" will be passed as this
-parameter instead.
-
-
-Similarly, the kernel may attempt to update an expired or a soon to expire
-key by executing:
-
-        /sbin/request-key update <key> <uid> <gid> \
-                <threadring> <processring> <sessionring>
-
-In this case, the program isn't required to actually attach the key to a
-ring; the rings are provided for reference.
-
-
-==================
-GARBAGE COLLECTION
-==================
-
-Dead keys (for which the type has been removed) will be automatically
-unlinked from those keyrings that point to them and deleted as soon as
-possible by a background garbage collector.
-
-Similarly, revoked and expired keys will be garbage collected, but only
-after a certain amount of time has passed.
-This time is set as a number of seconds in:
-
-        /proc/sys/kernel/keys/gc_delay
diff --git a/Documentation/networking/dns_resolver.txt b/Documentation/networking/dns_resolver.txt
index 04ca06325b0..7f531ad8328 100644
--- a/Documentation/networking/dns_resolver.txt
+++ b/Documentation/networking/dns_resolver.txt
@@ -139,8 +139,8 @@ the key will be discarded and recreated when the data it holds has expired.
 dns_query() returns a copy of the value attached to the key, or an error if
 that is indicated instead.
 
-See <file:Documentation/keys-request-key.txt> for further information about
-request-key function.
+See <file:Documentation/security/keys-request-key.txt> for further
+information about request-key function.
 
 =========
diff --git a/Documentation/security/00-INDEX b/Documentation/security/00-INDEX
new file mode 100644
index 00000000000..19bc49439ca
--- /dev/null
+++ b/Documentation/security/00-INDEX
@@ -0,0 +1,18 @@
+00-INDEX
+	- this file.
+SELinux.txt
+	- how to get started with the SELinux security enhancement.
+Smack.txt
+	- documentation on the Smack Linux Security Module.
+apparmor.txt
+	- documentation on the AppArmor security extension.
+credentials.txt
+	- documentation about credentials in Linux.
+keys-request-key.txt
+	- description of the kernel key request service.
+keys-trusted-encrypted.txt
+	- info on the Trusted and Encrypted keys in the kernel key ring service.
+keys.txt
+	- description of the kernel key retention service.
+tomoyo.txt
+	- documentation on the TOMOYO Linux Security Module.
diff --git a/Documentation/security/SELinux.txt b/Documentation/security/SELinux.txt
new file mode 100644
index 00000000000..07eae00f331
--- /dev/null
+++ b/Documentation/security/SELinux.txt
@@ -0,0 +1,27 @@
+If you want to use SELinux, chances are you will want
+to use the distro-provided policies, or install the
+latest reference policy release from
+	http://oss.tresys.com/projects/refpolicy
+
+However, if you want to install a dummy policy for
+testing, you can do so using 'mdp' provided under
+scripts/selinux. Note that this requires the selinux
+userspace to be installed - in particular you will
+need checkpolicy to compile the policy, and setfiles
+and fixfiles to label the filesystem.
+
+	1. Compile the kernel with selinux enabled.
+	2. Type 'make' to compile mdp.
+	3. Make sure that you are not running with
+	   SELinux enabled and a real policy. If
+	   you are, reboot with selinux disabled
+	   before continuing.
+	4. Run install_policy.sh:
+		cd scripts/selinux
+		sh install_policy.sh
+
+Step 4 will create a new dummy policy valid for your
+kernel, with a single selinux user, role, and type.
+It will compile the policy, set your SELINUXTYPE to
+dummy in /etc/selinux/config, install the compiled
+policy as 'dummy', and relabel your filesystem.
diff --git a/Documentation/security/Smack.txt b/Documentation/security/Smack.txt
new file mode 100644
index 00000000000..e9dab41c0fe
--- /dev/null
+++ b/Documentation/security/Smack.txt
@@ -0,0 +1,541 @@
+
+
+    "Good for you, you've decided to clean the elevator!"
+    - The Elevator, from Dark Star
+
+Smack is the Simplified Mandatory Access Control Kernel.
+Smack is a kernel based implementation of mandatory access
+control that includes simplicity in its primary design goals.
+
+Smack is not the only Mandatory Access Control scheme
+available for Linux. Those new to Mandatory Access Control
+are encouraged to compare Smack with the other mechanisms
+available to determine which is best suited to the problem
+at hand.
+
+Smack consists of three major components:
+    - The kernel
+    - A start-up script and a few modified applications
+    - Configuration data
+
+The kernel component of Smack is implemented as a Linux
+Security Modules (LSM) module. It requires netlabel and
+works best with file systems that support extended attributes,
+although xattr support is not strictly required.
+It is safe to run a Smack kernel under a "vanilla" distribution.
+Smack kernels use the CIPSO IP option. Some network
+configurations are intolerant of IP options and can impede
+access to systems that use them as Smack does.
+
+The startup script etc-init.d-smack should be installed
+in /etc/init.d/smack and should be invoked early in the
+start-up process. On Fedora rc5.d/S02smack is recommended.
+This script ensures that certain devices have the correct
+Smack attributes and loads the Smack configuration if
+any is defined. This script invokes two programs that
+ensure configuration data is properly formatted. These
+programs are /usr/sbin/smackload and /usr/sbin/smackcipso.
+The system will run just fine without these programs,
+but it will be difficult to set access rules properly.
+
+A version of "ls" that provides a "-M" option to display
+Smack labels in long listings is available.
+
+A hacked version of sshd that allows network logins by users
+with specific Smack labels is available. This version does
+not work for scp. You must set the /etc/ssh/sshd_config
+line:
+	UsePrivilegeSeparation no
+
+The format of /etc/smack/usr is:
+	username smack
+
+In keeping with the intent of Smack, configuration data is
+minimal and not strictly required. The most important
+configuration step is mounting the smackfs pseudo filesystem.
+
+Add this line to /etc/fstab:
+
+	smackfs /smack smackfs smackfsdef=* 0 0
+
+and create the /smack directory for mounting.
+
+Smack uses extended attributes (xattrs) to store file labels.
+The command to set a Smack label on a file is:
+
+	# attr -S -s SMACK64 -V "value" path
+
+NOTE: Smack labels are limited to 23 characters. The attr command
+      does not enforce this restriction and can be used to set
+      invalid Smack labels on files.
+
+If you don't do anything special, all users will get the floor ("_")
+label when they log in. If you do want to log in via the hacked ssh
+at other labels, use the attr command to set the smack value on the
+home directory and its contents.
+
+You can add access rules in /etc/smack/accesses. They take the form:
+
+	subjectlabel objectlabel access
+
+access is a combination of the letters rwxa which specify the
+kind of access permitted a subject with subjectlabel on an
+object with objectlabel. If there is no rule no access is allowed.
+
+A process can see the smack label it is running with by
+reading /proc/self/attr/current. A privileged process can
+set the process smack by writing there.
+
+Look for additional programs on http://schaufler-ca.com
+
+From the Smack Whitepaper:
+
+The Simplified Mandatory Access Control Kernel
+
+Casey Schaufler
+casey@schaufler-ca.com
+
+Mandatory Access Control
+
+Computer systems employ a variety of schemes to constrain how information is
+shared among the people and services using the machine. Some of these schemes
+allow the program or user to decide what other programs or users are allowed
+access to pieces of data. These schemes are called discretionary access
+control mechanisms because the access control is specified at the discretion
+of the user.
Other schemes do not leave the decision regarding what a user or
+program can access up to users or programs. These schemes are called mandatory
+access control mechanisms because you don't have a choice regarding the users
+or programs that have access to pieces of data.
+
+Bell & LaPadula
+
+From the middle of the 1980's until the turn of the century Mandatory Access
+Control (MAC) was very closely associated with the Bell & LaPadula security
+model, a mathematical description of the United States Department of Defense
+policy for marking paper documents. MAC in this form enjoyed a following
+within the Capital Beltway and Scandinavian supercomputer centers but was
+often cited as failing to address general needs.
+
+Domain Type Enforcement
+
+Around the turn of the century Domain Type Enforcement (DTE) became popular.
+This scheme organizes users, programs, and data into domains that are
+protected from each other. This scheme has been widely deployed as a component
+of popular Linux distributions. The administrative overhead required to
+maintain this scheme and the detailed understanding of the whole system
+necessary to provide a secure domain mapping leads to the scheme being
+disabled or used in limited ways in the majority of cases.
+
+Smack
+
+Smack is a Mandatory Access Control mechanism designed to provide useful MAC
+while avoiding the pitfalls of its predecessors. The limitations of Bell &
+LaPadula are addressed by providing a scheme whereby access can be controlled
+according to the requirements of the system and its purpose rather than those
+imposed by an arcane government policy. The complexity of Domain Type
+Enforcement is avoided by defining access controls in terms of the access
+modes already in use.
+
+Smack Terminology
+
+The jargon used to talk about Smack will be familiar to those who have dealt
+with other MAC systems and shouldn't be too difficult for the uninitiated to
+pick up. There are four terms that are used in a specific way and that are
+especially important:
+
+ Subject: A subject is an active entity on the computer system.
+ On Smack a subject is a task, which is in turn the basic unit
+ of execution.
+
+ Object: An object is a passive entity on the computer system.
+ On Smack files of all types, IPC, and tasks can be objects.
+
+ Access: Any attempt by a subject to put information into or get
+ information from an object is an access.
+
+ Label: Data that identifies the Mandatory Access Control
+ characteristics of a subject or an object.
+
+These definitions are consistent with the traditional use in the security
+community. There are also some terms from Linux that are likely to crop up:
+
+ Capability: A task that possesses a capability has permission to
+ violate an aspect of the system security policy, as identified by
+ the specific capability. A task that possesses one or more
+ capabilities is a privileged task, whereas a task with no
+ capabilities is an unprivileged task.
+
+ Privilege: A task that is allowed to violate the system security
+ policy is said to have privilege. As of this writing a task can
+ have privilege either by possessing capabilities or by having an
+ effective user ID of root.
+
+Smack Basics
+
+Smack is an extension to a Linux system. It enforces additional restrictions
+on what subjects can access which objects, based on the labels attached to
+each of the subject and the object.
+
+Labels
+
+Smack labels are ASCII character strings, one to twenty-three characters in
+length.
Single character labels using special characters, that being anything
+other than a letter or digit, are reserved for use by the Smack development
+team. Smack labels are unstructured, case sensitive, and the only operation
+ever performed on them is comparison for equality. Smack labels cannot
+contain unprintable characters, the "/" (slash), the "\" (backslash), the "'"
+(quote) and '"' (double-quote) characters.
+Smack labels cannot begin with a '-', which is reserved for special options.
+
+There are some predefined labels:
+
+ _ Pronounced "floor", a single underscore character.
+ ^ Pronounced "hat", a single circumflex character.
+ * Pronounced "star", a single asterisk character.
+ ? Pronounced "huh", a single question mark character.
+ @ Pronounced "Internet", a single at sign character.
+
+Every task on a Smack system is assigned a label. System tasks, such as
+init(8) and system daemons, are run with the floor ("_") label. User tasks
+are assigned labels according to the specification found in the
+/etc/smack/user configuration file.
+
+Access Rules
+
+Smack uses the traditional access modes of Linux. These modes are read,
+execute, write, and occasionally append. There are a few cases where the
+access mode may not be obvious. These include:
+
+ Signals: A signal is a write operation from the subject task to
+ the object task.
+ Internet Domain IPC: Transmission of a packet is considered a
+ write operation from the source task to the destination task.
+
+Smack restricts access based on the label attached to a subject and the label
+attached to the object it is trying to access. The rules enforced are, in
+order:
+
+ 1. Any access requested by a task labeled "*" is denied.
+ 2. A read or execute access requested by a task labeled "^"
+ is permitted.
+ 3. A read or execute access requested on an object labeled "_"
+ is permitted.
+ 4. Any access requested on an object labeled "*" is permitted.
+ 5. Any access requested by a task on an object with the same
+ label is permitted.
+ 6. Any access requested that is explicitly defined in the loaded
+ rule set is permitted.
+ 7. Any other access is denied.
+
+Smack Access Rules
+
+With the isolation provided by Smack, access separation is simple. There are
+many interesting cases where limited access by subjects to objects with
+different labels is desired. One example is the familiar spy model of
+sensitivity, where a scientist working on a highly classified project would be
+able to read documents of lower classifications and anything she writes will
+be "born" highly classified. To accommodate such schemes Smack includes a
+mechanism for specifying rules allowing access between labels.
+
+Access Rule Format
+
+The format of an access rule is:
+
+ subject-label object-label access
+
+Where subject-label is the Smack label of the task, object-label is the Smack
+label of the thing being accessed, and access is a string specifying the sort
+of access allowed. The Smack labels are limited to 23 characters. The access
+specification is searched for letters that describe access modes:
+
+ a: indicates that append access should be granted.
+ r: indicates that read access should be granted.
+ w: indicates that write access should be granted.
+ x: indicates that execute access should be granted.
+
+Uppercase values for the specification letters are allowed as well.
+Access mode specifications can be in any order.
Examples of acceptable rules
+are:
+
+ TopSecret Secret rx
+ Secret Unclass R
+ Manager Game x
+ User HR w
+ New Old rRrRr
+ Closed Off -
+
+Examples of unacceptable rules are:
+
+ Top Secret Secret rx
+ Ace Ace r
+ Odd spells waxbeans
+
+Spaces are not allowed in labels. Since a subject always has access to files
+with the same label, specifying a rule for that case is pointless. Only
+valid letters (rwxaRWXA) and the dash ('-') character are allowed in
+access specifications. The dash is a placeholder, so "a-r" is the same
+as "ar". A lone dash is used to specify that no access should be allowed.
+
+Applying Access Rules
+
+The developers of Linux rarely define new sorts of things, usually importing
+schemes and concepts from other systems. Most often, the other systems are
+variants of Unix. Unix has many endearing properties, but consistency of
+access control models is not one of them. Smack strives to treat accesses as
+uniformly as is sensible while keeping with the spirit of the underlying
+mechanism.
+
+File system objects including files, directories, named pipes, symbolic links,
+and devices require access permissions that closely match those used by mode
+bit access. To open a file for reading, read access is required on the file. To
+search a directory requires execute access. Creating a file with write access
+requires both read and write access on the containing directory. Deleting a
+file requires read and write access to the file and to the containing
+directory. It is possible that a user may be able to see that a file exists
+but not any of its attributes by the circumstance of having read access to the
+containing directory but not to the differently labeled file. This is an
+artifact of the file name being data in the directory, not a part of the file.
+
+IPC objects, message queues, semaphore sets, and memory segments exist in flat
+namespaces and access requests are only required to match the object in
+question.
+
+Process objects reflect tasks on the system and the Smack label used to access
+them is the same Smack label that the task would use for its own access
+attempts. Sending a signal via the kill() system call is a write operation
+from the signaler to the recipient. Debugging a process requires both reading
+and writing. Creating a new task is an internal operation that results in two
+tasks with identical Smack labels and requires no access checks.
+
+Sockets are data structures attached to processes and sending a packet from
+one process to another requires that the sender have write access to the
+receiver. The receiver is not required to have read access to the sender.
+
+Setting Access Rules
+
+The configuration file /etc/smack/accesses contains the rules to be set at
+system startup. The contents are written to the special file /smack/load.
+Rules can be written to /smack/load at any time and take effect immediately.
+For any pair of subject and object labels there can be only one rule, with the
+most recently specified overriding any earlier specification.
+
+The program smackload is provided to ensure data is formatted
+properly when written to /smack/load. This program reads lines
+of the form
+
+ subjectlabel objectlabel mode.
+
+Task Attribute
+
+The Smack label of a process can be read from /proc/<pid>/attr/current. A
+process can read its own Smack label from /proc/self/attr/current. A
+privileged process can change its own Smack label by writing to
+/proc/self/attr/current but not the label of another process.
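+
+As an illustration, a process might read its own label like so (a minimal
+sketch, not part of the original document; error handling is abbreviated
+and the buffer size simply allows for the 23-character label limit plus a
+terminator):
+
+	#include <stdio.h>
+	#include <fcntl.h>
+	#include <unistd.h>
+
+	int main(void)
+	{
+		char label[24];		/* 23-character maximum plus '\0' */
+		ssize_t len;
+		int fd;
+
+		/* Read the label this process is running with. */
+		fd = open("/proc/self/attr/current", O_RDONLY);
+		if (fd < 0)
+			return 1;
+		len = read(fd, label, sizeof(label) - 1);
+		close(fd);
+		if (len < 0)
+			return 1;
+		label[len] = '\0';
+		printf("running with label %s\n", label);
+		return 0;
+	}
+
+A privileged process could likewise change its own label by opening the
+same file for writing and writing the new label string to it.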
+
+File Attribute
+
+The Smack label of a filesystem object is stored as an extended attribute
+named SMACK64 on the file. This attribute is in the security namespace. It can
+only be changed by a process with privilege.
+
+Privilege
+
+A process with CAP_MAC_OVERRIDE is privileged.
+
+Smack Networking
+
+As mentioned before, Smack enforces access control on network protocol
+transmissions. Every packet sent by a Smack process is tagged with its Smack
+label. This is done by adding a CIPSO tag to the header of the IP packet. Each
+packet received is expected to have a CIPSO tag that identifies the label; if
+it lacks such a tag, the network ambient label is assumed. Before the packet
+is delivered, a check is made to determine that a subject with the label on the
+packet has write access to the receiving process, and if that is not the case
+the packet is dropped.
+
+CIPSO Configuration
+
+It is normally unnecessary to specify the CIPSO configuration. The default
+values used by the system handle all internal cases. Smack will compose CIPSO
+label values to match the Smack labels being used without administrative
+intervention. Unlabeled packets that come into the system will be given the
+ambient label.
+
+Smack requires configuration in the case where packets may be encountered from
+a non-Smack system that speaks CIPSO. Usually this will be a Trusted
+Solaris system, but there are other, less widely deployed systems out there.
+CIPSO provides 3 important values, a Domain Of Interpretation (DOI), a level,
+and a category set with each packet. The DOI is intended to identify a group
+of systems that use compatible labeling schemes, and the DOI specified on the
+Smack system must match that of the remote system or packets will be
+discarded. The DOI is 3 by default. The value can be read from /smack/doi and
+can be changed by writing to /smack/doi.
+
+The label and category set are mapped to a Smack label as defined in
+/etc/smack/cipso.
+
+A Smack/CIPSO mapping has the form:
+
+ smack level [category [category]*]
+
+Smack does not expect the level or category sets to be related in any
+particular way and does not assume or assign accesses based on them. Some
+examples of mappings:
+
+ TopSecret 7
+ TS:A,B 7 1 2
+ SecBDE 5 2 4 6
+ RAFTERS 7 12 26
+
+The ":" and "," characters are permitted in a Smack label but have no special
+meaning.
+
+The mapping of Smack labels to CIPSO values is defined by writing to
+/smack/cipso. Again, the format of data written to this special file
+is highly restrictive, so the program smackcipso is provided to
+ensure the writes are done properly. This program takes mappings
+on the standard input and sends them to /smack/cipso properly.
+
+In addition to explicit mappings Smack supports direct CIPSO mappings. One
+CIPSO level is used to indicate that the category set passed in the packet is
+in fact an encoding of the Smack label. The level used is 250 by default. The
+value can be read from /smack/direct and changed by writing to /smack/direct.
+
+Socket Attributes
+
+There are two attributes that are associated with sockets. These attributes
+can only be set by privileged tasks, but any task can read them for their own
+sockets.
+
+ SMACK64IPIN: The Smack label of the task object. A privileged
+ program that will enforce policy may set this to the star label.
+
+ SMACK64IPOUT: The Smack label transmitted with outgoing packets.
+ A privileged program may set this to match the label of another
+ task with which it hopes to communicate.
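+
+For example, a task could inspect the labels on one of its own sockets with
+fgetxattr(2). The following is a minimal sketch, not part of the original
+document; a freshly created socket is used, so the labels reported simply
+reflect the creating task, and error handling is abbreviated:
+
+	#include <stdio.h>
+	#include <sys/socket.h>
+	#include <sys/xattr.h>
+
+	int main(void)
+	{
+		char label[24];		/* 23-character maximum plus '\0' */
+		ssize_t len;
+		int fd = socket(AF_INET, SOCK_STREAM, 0);
+
+		if (fd < 0)
+			return 1;
+
+		/* The label incoming packets are checked against. */
+		len = fgetxattr(fd, "security.SMACK64IPIN",
+				label, sizeof(label) - 1);
+		if (len >= 0) {
+			label[len] = '\0';
+			printf("SMACK64IPIN:  %s\n", label);
+		}
+
+		/* The label transmitted with outgoing packets. */
+		len = fgetxattr(fd, "security.SMACK64IPOUT",
+				label, sizeof(label) - 1);
+		if (len >= 0) {
+			label[len] = '\0';
+			printf("SMACK64IPOUT: %s\n", label);
+		}
+		return 0;
+	}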
+
+Smack Netlabel Exceptions
+
+You will often find that your labeled application has to talk to the outside,
+unlabeled world. To do this there's a special file /smack/netlabel where you can
+add some exceptions in the form of:
+@IP1 LABEL1 or
+@IP2/MASK LABEL2
+
+It means that your application will have unlabeled access to @IP1 if it has
+write access on LABEL1, and access to the subnet @IP2/MASK if it has write
+access on LABEL2.
+
+Entries in the /smack/netlabel file are matched by longest mask first, like in
+classless IPv4 routing.
+
+A special label '@' and an option '-CIPSO' can be used there:
+@ means Internet, any application with any label has access to it
+-CIPSO means standard CIPSO networking
+
+If you don't know what CIPSO is and don't plan to use it, you can just do:
+echo 127.0.0.1 -CIPSO > /smack/netlabel
+echo 0.0.0.0/0 @ > /smack/netlabel
+
+If you use CIPSO on your 192.168.0.0/16 local network and also need unlabeled
+Internet access, you can have:
+echo 127.0.0.1 -CIPSO > /smack/netlabel
+echo 192.168.0.0/16 -CIPSO > /smack/netlabel
+echo 0.0.0.0/0 @ > /smack/netlabel
+
+
+Writing Applications for Smack
+
+There are three sorts of applications that will run on a Smack system. How an
+application interacts with Smack will determine what it will have to do to
+work properly under Smack.
+
+Smack Ignorant Applications
+
+By far the majority of applications have no reason whatever to care about the
+unique properties of Smack. Since invoking a program has no impact on the
+Smack label associated with the process the only concern likely to arise is
+whether the process has execute access to the program.
+
+Smack Relevant Applications
+
+Some programs can be improved by teaching them about Smack, but do not make
+any security decisions themselves. The utility ls(1) is one example of such a
+program.
+
+Smack Enforcing Applications
+
+These are special programs that not only know about Smack, but participate in
+the enforcement of system policy. In most cases these are the programs that
+set up user sessions. There are also network services that provide information
+to processes running with various labels.
+
+File System Interfaces
+
+Smack maintains labels on file system objects using extended attributes. The
+Smack label of a file, directory, or other file system object can be obtained
+using getxattr(2).
+
+ len = getxattr("/", "security.SMACK64", value, sizeof (value));
+
+will put the Smack label of the root directory into value. A privileged
+process can set the Smack label of a file system object with setxattr(2).
+
+ len = strlen("Rubble");
+ rc = setxattr("/foo", "security.SMACK64", "Rubble", len, 0);
+
+will set the Smack label of /foo to "Rubble" if the program has appropriate
+privilege.
+
+Socket Interfaces
+
+The socket attributes can be read using fgetxattr(2).
+
+A privileged process can set the Smack label of outgoing packets with
+fsetxattr(2).
+
+ len = strlen("Rubble");
+ rc = fsetxattr(fd, "security.SMACK64IPOUT", "Rubble", len, 0);
+
+will set the Smack label "Rubble" on packets going out from the socket if the
+program has appropriate privilege.
+
+ rc = fsetxattr(fd, "security.SMACK64IPIN", "*", strlen("*"), 0);
+
+will set the Smack label "*" as the object label against which incoming
+packets will be checked if the program has appropriate privilege.
+
+Administration
+
+Smack supports some mount options:
+
+ smackfsdef=label: specifies the label to give files that lack
+ the Smack label extended attribute.
+
+ smackfsroot=label: specifies the label to assign the root of the
+ file system if it lacks the Smack extended attribute.
+
+ smackfshat=label: specifies a label that must have read access to
+ all labels set on the filesystem. Not yet enforced.
+
+ smackfsfloor=label: specifies a label to which all labels set on the
+ filesystem must have read access. Not yet enforced.
+
+These mount options apply to all file system types.
+
+Smack auditing
+
+If you want Smack auditing of security events, you need to set CONFIG_AUDIT
+in your kernel configuration.
+By default, all denied events will be audited. You can change this behavior by
+writing a single character to the /smack/logging file:
+0: no logging
+1: log denied (default)
+2: log accepted
+3: log denied & accepted
+
+Events are logged as 'key=value' pairs; for each event you will at least get
+the subject, the object, the rights requested, the action, the kernel function
+that triggered the event, plus other pairs depending on the type of event
+audited.
diff --git a/Documentation/security/apparmor.txt b/Documentation/security/apparmor.txt
new file mode 100644
index 00000000000..93c1fd7d063
--- /dev/null
+++ b/Documentation/security/apparmor.txt
@@ -0,0 +1,39 @@
+--- What is AppArmor? ---
+
+AppArmor is a MAC style security extension for the Linux kernel. It implements
+a task centered policy, with task "profiles" being created and loaded
+from user space. Tasks on the system that do not have a profile defined for
+them run in an unconfined state which is equivalent to standard Linux DAC
+permissions.
+
+--- How to enable/disable ---
+
+set CONFIG_SECURITY_APPARMOR=y
+
+If AppArmor should be selected as the default security module then
+ set CONFIG_DEFAULT_SECURITY="apparmor"
+ and CONFIG_SECURITY_APPARMOR_BOOTPARAM_VALUE=1
+
+Build the kernel
+
+If AppArmor is not the default security module it can be enabled by passing
+security=apparmor on the kernel's command line.
+
+If AppArmor is the default security module it can be disabled by passing
+apparmor=0, security=XXXX (where XXXX is a valid security module), on the
+kernel's command line.
+
+For AppArmor to enforce any restrictions beyond standard Linux DAC permissions
+policy must be loaded into the kernel from user space (see the Documentation
+and tools links).
+
+--- Documentation ---
+
+Documentation can be found on the wiki.
+
+--- Links ---
+
+Mailing List - apparmor@lists.ubuntu.com
+Wiki - http://apparmor.wiki.kernel.org/
+User space tools - https://launchpad.net/apparmor
+Kernel module - git://git.kernel.org/pub/scm/linux/kernel/git/jj/apparmor-dev.git
diff --git a/Documentation/security/credentials.txt b/Documentation/security/credentials.txt
new file mode 100644
index 00000000000..fc0366cbd7c
--- /dev/null
+++ b/Documentation/security/credentials.txt
@@ -0,0 +1,581 @@
+ ====================
+ CREDENTIALS IN LINUX
+ ====================
+
+By: David Howells
+
+Contents:
+
+ (*) Overview.
+
+ (*) Types of credentials.
+
+ (*) File markings.
+
+ (*) Task credentials.
+
+ - Immutable credentials.
+ - Accessing task credentials.
+ - Accessing another task's credentials.
+ - Altering credentials.
+ - Managing credentials.
+
+ (*) Open file credentials.
+
+ (*) Overriding the VFS's use of credentials.
+
+
+========
+OVERVIEW
+========
+
+There are several parts to the security check performed by Linux when one
+object acts upon another:
+
+ (1) Objects.
+
+ Objects are things in the system that may be acted upon directly by
+ userspace programs.
Linux has a variety of actionable objects, including:
+
+ - Tasks
+ - Files/inodes
+ - Sockets
+ - Message queues
+ - Shared memory segments
+ - Semaphores
+ - Keys
+
+ As a part of the description of all these objects there is a set of
+ credentials. What's in the set depends on the type of object.
+
+ (2) Object ownership.
+
+ Amongst the credentials of most objects, there will be a subset that
+ indicates the ownership of that object. This is used for resource
+ accounting and limitation (disk quotas and task rlimits for example).
+
+ In a standard UNIX filesystem, for instance, this will be defined by the
+ UID marked on the inode.
+
+ (3) The objective context.
+
+ Also amongst the credentials of those objects, there will be a subset that
+ indicates the 'objective context' of that object. This may or may not be
+ the same set as in (2) - in standard UNIX files, for instance, this is
+ defined by the UID and the GID marked on the inode.
+
+ The objective context is used as part of the security calculation that is
+ carried out when an object is acted upon.
+
+ (4) Subjects.
+
+ A subject is an object that is acting upon another object.
+
+ Most of the objects in the system are inactive: they don't act on other
+ objects within the system. Processes/tasks are the obvious exception:
+ they do stuff; they access and manipulate things.
+
+ Objects other than tasks may under some circumstances also be subjects.
+ For instance an open file may send SIGIO to a task using the UID and EUID
+ given to it by a task that called fcntl(F_SETOWN) upon it. In this case,
+ the file struct will have a subjective context too.
+
+ (5) The subjective context.
+
+ A subject has an additional interpretation of its credentials. A subset
+ of its credentials forms the 'subjective context'. The subjective context
+ is used as part of the security calculation that is carried out when a
+ subject acts.
+
+ A Linux task, for example, has the FSUID, FSGID and the supplementary
+ group list for when it is acting upon a file - which are quite separate
+ from the real UID and GID that normally form the objective context of the
+ task.
+
+ (6) Actions.
+
+ Linux has a number of actions available that a subject may perform upon an
+ object. The set of actions available depends on the nature of the subject
+ and the object.
+
+ Actions include reading, writing, creating and deleting files; forking or
+ signalling and tracing tasks.
+
+ (7) Rules, access control lists and security calculations.
+
+ When a subject acts upon an object, a security calculation is made. This
+ involves taking the subjective context, the objective context and the
+ action, and searching one or more sets of rules to see whether the subject
+ is granted or denied permission to act in the desired manner on the
+ object, given those contexts.
+
+ There are two main sources of rules:
+
+ (a) Discretionary access control (DAC):
+
+ Sometimes the object will include sets of rules as part of its
+ description. This is an 'Access Control List' or 'ACL'. A Linux
+ file may supply more than one ACL.
+
+ A traditional UNIX file, for example, includes a permissions mask that
+ is an abbreviated ACL with three fixed classes of subject ('user',
+ 'group' and 'other'), each of which may be granted certain privileges
+ ('read', 'write' and 'execute' - whatever those map to for the object
+ in question). UNIX file permissions do not allow the arbitrary
+ specification of subjects, however, and so are of limited use.
+
+ A Linux file might also sport a POSIX ACL. This is a list of rules
+ that grants various permissions to arbitrary subjects.
+
+ (b) Mandatory access control (MAC):
+
+ The system as a whole may have one or more sets of rules that get
+ applied to all subjects and objects, regardless of their source.
+ SELinux and Smack are examples of this.
+
+ In the case of SELinux and Smack, each object is given a label as part
+ of its credentials. When an action is requested, they take the
+ subject label, the object label and the action and look for a rule
+ that says that this action is either granted or denied.
+
+
+====================
+TYPES OF CREDENTIALS
+====================
+
+The Linux kernel supports the following types of credentials:
+
+ (1) Traditional UNIX credentials.
+
+ Real User ID
+ Real Group ID
+
+ The UID and GID are carried by most, if not all, Linux objects, even if in
+ some cases they have to be invented (FAT or CIFS files for example, which
+ are derived from Windows). These (mostly) define the objective context of
+ that object, with tasks being slightly different in some cases.
+
+ Effective, Saved and FS User ID
+ Effective, Saved and FS Group ID
+ Supplementary groups
+
+ These are additional credentials used by tasks only. Usually, an
+ EUID/EGID/GROUPS will be used as the subjective context, and real UID/GID
+ will be used as the objective. For tasks, it should be noted that this is
+ not always true.
+
+ (2) Capabilities.
+
+ Set of permitted capabilities
+ Set of inheritable capabilities
+ Set of effective capabilities
+ Capability bounding set
+
+ These are only carried by tasks. They indicate superior capabilities
+ granted piecemeal to a task that an ordinary task wouldn't otherwise have.
+ These are manipulated implicitly by changes to the traditional UNIX
+ credentials, but can also be manipulated directly by the capset() system
+ call.
+
+ The permitted capabilities are those caps that the process might grant
+ itself to its effective or permitted sets through capset(). The
+ inheritable set might also be so constrained.
+
+ The effective capabilities are the ones that a task is actually allowed to
+ make use of itself.
+
+ The inheritable capabilities are the ones that may get passed across
+ execve().
+
+ The bounding set limits the capabilities that may be inherited across
+ execve(), especially when a binary is executed that will execute as UID 0.
+
+ (3) Secure management flags (securebits).
+
+ These are only carried by tasks. These govern the way the above
+ credentials are manipulated and inherited over certain operations such as
+ execve(). They aren't used directly as objective or subjective
+ credentials.
+
+ (4) Keys and keyrings.
+
+ These are only carried by tasks. They carry and cache security tokens
+ that don't fit into the other standard UNIX credentials. They are for
+ making such things as network filesystem keys available to the file
+ accesses performed by processes, without the necessity of ordinary
+ programs having to know about security details involved.
+
+ Keyrings are a special type of key. They carry sets of other keys and can
+ be searched for the desired key. Each process may subscribe to a number
+ of keyrings:
+
+ Per-thread keyring
+ Per-process keyring
+ Per-session keyring
+
+ When a process accesses a key, if not already present, it will normally be
+ cached on one of these keyrings for future accesses to find.
+
+ For more information on using keys, see Documentation/security/keys.txt.
+
+ (5) LSM
+
+ The Linux Security Module allows extra controls to be placed over the
+ operations that a task may do. Currently Linux supports two main
+ alternate LSM options: SELinux and Smack.
+
+ Both work by labelling the objects in a system and then applying sets of
+ rules (policies) that say what operations a task with one label may do to
+ an object with another label.
+
+ (6) AF_KEY
+
+ This is a socket-based approach to credential management for networking
+ stacks [RFC 2367]. It isn't discussed by this document as it doesn't
+ interact directly with task and file credentials; rather it keeps system
+ level credentials.
+
+
+When a file is opened, part of the opening task's subjective context is
+recorded in the file struct created. This allows operations using that file
+struct to use those credentials instead of the subjective context of the task
+that issued the operation. An example of this would be a file opened on a
+network filesystem where the credentials of the opened file should be presented
+to the server, regardless of who is actually doing a read or a write upon it.
+
+
+=============
+FILE MARKINGS
+=============
+
+Files on disk or obtained over the network may have annotations that form the
+objective security context of that file. Depending on the type of filesystem,
+this may include one or more of the following:
+
+ (*) UNIX UID, GID, mode;
+
+ (*) Windows user ID;
+
+ (*) Access control list;
+
+ (*) LSM security label;
+
+ (*) UNIX exec privilege escalation bits (SUID/SGID);
+
+ (*) File capabilities exec privilege escalation bits.
+
+These are compared to the task's subjective security context, and certain
+operations allowed or disallowed as a result. In the case of execve(), the
+privilege escalation bits come into play, and may allow the resulting process
+extra privileges, based on the annotations on the executable file.
+
+
+================
+TASK CREDENTIALS
+================
+
+In Linux, all of a task's credentials are held either directly (uid, gid) or
+via pointers (groups, keys, LSM security) in a refcounted structure of type
+'struct cred'. Each task points to its credentials by a pointer called 'cred'
+in its task_struct.
+
+Once a set of credentials has been prepared and committed, it may not be
+changed, barring the following exceptions:
+
+ (1) its reference count may be changed;
+
+ (2) the reference count on the group_info struct it points to may be changed;
+
+ (3) the reference count on the security data it points to may be changed;
+
+ (4) the reference count on any keyrings it points to may be changed;
+
+ (5) any keyrings it points to may be revoked, expired or have their security
+ attributes changed; and
+
+ (6) the contents of any keyrings to which it points may be changed (the whole
+ point of keyrings being a shared set of credentials, modifiable by anyone
+ with appropriate access).
+
+To alter anything in the cred struct, the copy-and-replace principle must be
+adhered to. First take a copy, then alter the copy and then use RCU to change
+the task pointer to make it point to the new copy. There are wrappers to aid
+with this (see below).
+
+A task may only alter its _own_ credentials; it is no longer permitted for a
+task to alter another's credentials. This means the capset() system call is no
+longer permitted to take any PID other than the one of the current process.
+Also, the keyctl_instantiate() and keyctl_negate() functions no longer permit
+attachment to process-specific keyrings in the requesting process as the
+instantiating process may need to create them.
+
+
+IMMUTABLE CREDENTIALS
+---------------------
+
+Once a set of credentials has been made public (by calling commit_creds() for
+example), it must be considered immutable, barring two exceptions:
+
+ (1) The reference count may be altered.
+
+ (2) Whilst the keyring subscriptions of a set of credentials may not be
+ changed, the keyrings subscribed to may have their contents altered.
+
+To catch accidental credential alteration at compile time, struct task_struct
+has _const_ pointers to its credential sets, as does struct file. Furthermore,
+certain functions such as get_cred() and put_cred() operate on const pointers,
+thus rendering casts unnecessary, but must temporarily ditch the const
+qualification to be able to alter the reference count.
+
+
+ACCESSING TASK CREDENTIALS
+--------------------------
+
+A task being able to alter only its own credentials permits the current process
+to read or replace its own credentials without the need for any form of locking
+- which simplifies things greatly. It can just call:
+
+ const struct cred *current_cred()
+
+to get a pointer to its credentials structure, and it doesn't have to release
+it afterwards.
+
+There are convenience wrappers for retrieving specific aspects of a task's
+credentials (the value is simply returned in each case):
+
+ uid_t current_uid(void) Current's real UID
+ gid_t current_gid(void) Current's real GID
+ uid_t current_euid(void) Current's effective UID
+ gid_t current_egid(void) Current's effective GID
+ uid_t current_fsuid(void) Current's file access UID
+ gid_t current_fsgid(void) Current's file access GID
+ kernel_cap_t current_cap(void) Current's effective capabilities
+ void *current_security(void) Current's LSM security pointer
+ struct user_struct *current_user(void) Current's user account
+
+There are also convenience wrappers for retrieving specific associated pairs of
+a task's credentials:
+
+ void current_uid_gid(uid_t *, gid_t *);
+ void current_euid_egid(uid_t *, gid_t *);
+ void current_fsuid_fsgid(uid_t *, gid_t *);
+
+which return these pairs of values through their arguments after retrieving
+them from the current task's credentials.
+
+
+In addition, there is a function for obtaining a reference on the current
+process's current set of credentials:
+
+ const struct cred *get_current_cred(void);
+
+and functions for getting references to one of the credentials that don't
+actually live in struct cred:
+
+ struct user_struct *get_current_user(void);
+ struct group_info *get_current_groups(void);
+
+which get references to the current process's user accounting structure and
+supplementary groups list respectively.
+
+Once a reference has been obtained, it must be released with put_cred(),
+free_uid() or put_group_info() as appropriate.
+
+
+ACCESSING ANOTHER TASK'S CREDENTIALS
+------------------------------------
+
+Whilst a task may access its own credentials without the need for locking, the
+same is not true of a task wanting to access another task's credentials. It
+must use the RCU read lock and rcu_dereference().
+
+The rcu_dereference() is wrapped by:
+
+ const struct cred *__task_cred(struct task_struct *task);
+
+This should be used inside the RCU read lock, as in the following example:
+
+ void foo(struct task_struct *t, struct foo_data *f)
+ {
+ const struct cred *tcred;
+ ...
+ rcu_read_lock();
+ tcred = __task_cred(t);
+ f->uid = tcred->uid;
+ f->gid = tcred->gid;
+ f->groups = get_group_info(tcred->groups);
+ rcu_read_unlock();
+ ...
+ }
+
+Should it be necessary to hold another task's credentials for a long period of
+time, and possibly to sleep whilst doing so, then the caller should get a
+reference on them using:
+
+ const struct cred *get_task_cred(struct task_struct *task);
+
+This does all the RCU magic inside of it. The caller must call put_cred() on
+the credentials so obtained when they're finished with.
+
+ [*] Note: The result of __task_cred() should not be passed directly to
+ get_cred() as this may race with commit_creds().
+
+There are a couple of convenience functions to access bits of another task's
+credentials, hiding the RCU magic from the caller:
+
+ uid_t task_uid(task) Task's real UID
+ uid_t task_euid(task) Task's effective UID
+
+If the caller is holding the RCU read lock at the time anyway, then:
+
+ __task_cred(task)->uid
+ __task_cred(task)->euid
+
+should be used instead. Similarly, if multiple aspects of a task's credentials
+need to be accessed, RCU read lock should be used, __task_cred() called, the
+result stored in a temporary pointer and then the credential aspects called
+from that before dropping the lock. This prevents the potentially expensive
+RCU magic from being invoked multiple times.
+
+Should some other single aspect of another task's credentials need to be
+accessed, then this can be used:
+
+ task_cred_xxx(task, member)
+
+where 'member' is a non-pointer member of the cred struct. For instance:
+
+ uid_t task_cred_xxx(task, suid);
+
+will retrieve 'struct cred::suid' from the task, doing the appropriate RCU
+magic. This may not be used for pointer members as what they point to may
+disappear the moment the RCU read lock is dropped.
+
+
+ALTERING CREDENTIALS
+--------------------
+
+As previously mentioned, a task may only alter its own credentials, and may not
+alter those of another task. This means that it doesn't need to use any
+locking to alter its own credentials.
+
+To alter the current process's credentials, a function should first prepare a
+new set of credentials by calling:
+
+ struct cred *prepare_creds(void);
+
+this locks current->cred_replace_mutex and then allocates and constructs a
+duplicate of the current process's credentials, returning with the mutex still
+held if successful. It returns NULL if not successful (out of memory).
+
+The mutex prevents ptrace() from altering the ptrace state of a process whilst
+security checks on credentials construction and changing are taking place, as
+the ptrace state may alter the outcome, particularly in the case of execve().
+
+The new credentials set should be altered appropriately, and any security
+checks and hooks done. Both the current and the proposed sets of credentials
+are available for this purpose as current_cred() will return the current set
+still at this point.
+
+
+When the credential set is ready, it should be committed to the current process
+by calling:
+
+ int commit_creds(struct cred *new);
+
+This will alter various aspects of the credentials and the process, giving the
+LSM a chance to do likewise, then it will use rcu_assign_pointer() to actually
+commit the new credentials to current->cred, it will release
+current->cred_replace_mutex to allow ptrace() to take place, and it will notify
+the scheduler and others of the changes.
+
+This function is guaranteed to return 0, so that it can be tail-called at the
+end of such functions as sys_setresuid().
+
+Note that this function consumes the caller's reference to the new credentials.
+The caller should _not_ call put_cred() on the new credentials afterwards.
+
+Furthermore, once this function has been called on a new set of credentials,
+those credentials may _not_ be changed further.
+
+
+Should the security checks fail or some other error occur after prepare_creds()
+has been called, then the following function should be invoked:
+
+ void abort_creds(struct cred *new);
+
+This releases the lock on current->cred_replace_mutex that prepare_creds() got
+and then releases the new credentials.
+
+
+A typical credentials alteration function would look something like this:
+
+ int alter_suid(uid_t suid)
+ {
+ struct cred *new;
+ int ret;
+
+ new = prepare_creds();
+ if (!new)
+ return -ENOMEM;
+
+ new->suid = suid;
+ ret = security_alter_suid(new);
+ if (ret < 0) {
+ abort_creds(new);
+ return ret;
+ }
+
+ return commit_creds(new);
+ }
+
+
+MANAGING CREDENTIALS
+--------------------
+
+There are some functions to help manage credentials:
+
+ (*) void put_cred(const struct cred *cred);
+
+ This releases a reference to the given set of credentials. If the
+ reference count reaches zero, the credentials will be scheduled for
+ destruction by the RCU system.
+
+ (*) const struct cred *get_cred(const struct cred *cred);
+
+ This gets a reference on a live set of credentials, returning a pointer to
+ that set of credentials.
+
+ (*) struct cred *get_new_cred(struct cred *cred);
+
+ This gets a reference on a set of credentials that is under construction
+ and is thus still mutable, returning a pointer to that set of credentials.
+
+
+=====================
+OPEN FILE CREDENTIALS
+=====================
+
+When a new file is opened, a reference is obtained on the opening task's
+credentials and this is attached to the file struct as 'f_cred' in place of
+'f_uid' and 'f_gid'. Code that used to access file->f_uid and file->f_gid
+should now access file->f_cred->fsuid and file->f_cred->fsgid.
+
+It is safe to access f_cred without the use of RCU or locking because the
+pointer will not change over the lifetime of the file struct, and nor will the
+contents of the cred struct pointed to, barring the exceptions listed above
+(see the Task Credentials section).
+
+
+=======================================
+OVERRIDING THE VFS'S USE OF CREDENTIALS
+=======================================
+
+Under some circumstances it is desirable to override the credentials used by
+the VFS, and that can be done by calling into functions such as vfs_mkdir()
+with a different set of credentials. This is done in the following places:
+
+ (*) sys_faccessat().
+
+ (*) do_coredump().
+
+ (*) nfs4recover.c.
diff --git a/Documentation/security/keys-request-key.txt b/Documentation/security/keys-request-key.txt
new file mode 100644
index 00000000000..51987bfecfe
--- /dev/null
+++ b/Documentation/security/keys-request-key.txt
@@ -0,0 +1,202 @@
+ ===================
+ KEY REQUEST SERVICE
+ ===================
+
+The key request service is part of the key retention service (refer to
+Documentation/security/keys.txt). This document explains more fully how
+the requesting algorithm works.
+
+The process starts by either the kernel requesting a service by calling
+request_key*():
+
+ struct key *request_key(const struct key_type *type,
+ const char *description,
+ const char *callout_info);
+
+or:
+
+ struct key *request_key_with_auxdata(const struct key_type *type,
+ const char *description,
+ const char *callout_info,
+ size_t callout_len,
+ void *aux);
+
+or:
+
+ struct key *request_key_async(const struct key_type *type,
+ const char *description,
+ const char *callout_info,
+ size_t callout_len);
+
+or:
+
+ struct key *request_key_async_with_auxdata(const struct key_type *type,
+ const char *description,
+ const char *callout_info,
+ size_t callout_len,
+ void *aux);
+
+Or by userspace invoking the request_key system call:
+
+ key_serial_t request_key(const char *type,
+ const char *description,
+ const char *callout_info,
+ key_serial_t dest_keyring);
+
+The main difference between the access points is that the in-kernel interface
+does not need to link the key to a keyring to prevent it from being immediately
+destroyed. The kernel interface returns a pointer directly to the key, and
+it's up to the caller to destroy the key.
+
+The request_key*_with_auxdata() calls are like the in-kernel request_key*()
+calls, except that they permit auxiliary data to be passed to the upcaller (the
+default is NULL). This is only useful for those key types that define their
+own upcall mechanism rather than using /sbin/request-key.
+
+The two async in-kernel calls may return keys that are still in the process of
+being constructed. The two non-async ones will wait for construction to
+complete first.
+
+The userspace interface links the key to a keyring associated with the process
+to prevent the key from going away, and returns the serial number of the key to
+the caller.
+
+
+The following example assumes that the key types involved don't define their
+own upcall mechanisms. If they do, then those should be substituted for the
+forking and execution of /sbin/request-key.
+
+
+===========
+THE PROCESS
+===========
+
+A request proceeds in the following manner:
+
+ (1) Process A calls request_key() [the userspace syscall calls the kernel
+ interface].
+
+ (2) request_key() searches the process's subscribed keyrings to see if there's
+ a suitable key there. If there is, it returns the key. If there isn't,
+ and callout_info is not set, an error is returned. Otherwise the process
+ proceeds to the next step.
+
+ (3) request_key() sees that A doesn't have the desired key yet, so it creates
+ two things:
+
+ (a) An uninstantiated key U of requested type and description.
+
+ (b) An authorisation key V that refers to key U and notes that process A
+ is the context in which key U should be instantiated and secured, and
+ from which associated key requests may be satisfied.
+
+ (4) request_key() then forks and executes /sbin/request-key with a new session
+ keyring that contains a link to auth key V.
+
+ (5) /sbin/request-key assumes the authority associated with key U.
+
+ (6) /sbin/request-key execs an appropriate program to perform the actual
+ instantiation.
+
+ (7) The program may want to access another key from A's context (say a
+ Kerberos TGT key). It just requests the appropriate key, and the keyring
+ search notes that the session keyring has auth key V in its bottom level.
+
+ This will permit it to then search the keyrings of process A with the
+ UID, GID, groups and security info of process A as if it was process A,
+ and come up with key W.
+
+ (8) The program then does what it must to get the data with which to
+ instantiate key U, using key W as a reference (perhaps it contacts a
+ Kerberos server using the TGT) and then instantiates key U.
+
+ (9) Upon instantiating key U, auth key V is automatically revoked so that it
+ may not be used again.
+
+(10) The program then exits 0 and request_key() deletes key V and returns key
+ U to the caller.
+
+This also extends further. If key W (step 7 above) didn't exist, key W would
+be created uninstantiated, another auth key (X) would be created (as per step
+3) and another copy of /sbin/request-key spawned (as per step 4); but the
+context specified by auth key X will still be process A, as it was in auth key
+V.
+
+This is because process A's keyrings can't simply be attached to
+/sbin/request-key at the appropriate places because (a) execve will discard two
+of them, and (b) it requires the same UID/GID/Groups all the way through.
+
+
+====================================
+NEGATIVE INSTANTIATION AND REJECTION
+====================================
+
+Rather than instantiating a key, it is possible for the possessor of an
+authorisation key to negatively instantiate a key that's under construction.
+This is a short duration placeholder that causes any attempt at re-requesting
+the key whilst it exists to fail with error ENOKEY if negated or the specified
+error if rejected.
+
+This is provided to prevent excessive repeated spawning of /sbin/request-key
+processes for a key that will never be obtainable.
+
+Should the /sbin/request-key process exit anything other than 0 or die on a
+signal, the key under construction will be automatically negatively
+instantiated for a short amount of time.
+
+
+====================
+THE SEARCH ALGORITHM
+====================
+
+A search of any particular keyring proceeds in the following fashion:
+
+ (1) When the key management code searches for a key (keyring_search_aux) it
+ firstly calls key_permission(SEARCH) on the keyring it's starting with;
+ if this denies permission, it doesn't search further.
+
+ (2) It considers all the non-keyring keys within that keyring and, if any key
+ matches the criteria specified, calls key_permission(SEARCH) on it to see
+ if the key is allowed to be found. If it is, that key is returned; if
+ not, the search continues, and the error code is retained if of higher
+ priority than the one currently set.
+
+ (3) It then considers all the keyring-type keys in the keyring it's currently
+ searching. It calls key_permission(SEARCH) on each keyring, and if this
+ grants permission, it recurses, executing steps (2) and (3) on that
+ keyring.
+
+The process stops as soon as a valid key is found with permission granted to
+use it. Any error from a previous match attempt is discarded and the key is
+returned.
+
+When search_process_keyrings() is invoked, it performs the following searches
+until one succeeds:
+
+ (1) If extant, the process's thread keyring is searched.
+
+ (2) If extant, the process's process keyring is searched.
+
+ (3) The process's session keyring is searched.
+
+ (4) If the process has assumed the authority associated with a request_key()
+ authorisation key then:
+
+ (a) If extant, the calling process's thread keyring is searched.
+
+ (b) If extant, the calling process's process keyring is searched.
+
+ (c) The calling process's session keyring is searched.
+
+The moment one succeeds, all pending errors are discarded and the found key is
+returned.
+
+Only if all these fail does the whole thing fail with the highest priority
+error. Note that several errors may have come from LSM.
+
+The error priority is:
+
+ EKEYREVOKED > EKEYEXPIRED > ENOKEY
+
+EACCES/EPERM are only returned on a direct search of a specific keyring where
+the basal keyring does not grant Search permission.
diff --git a/Documentation/security/keys-trusted-encrypted.txt b/Documentation/security/keys-trusted-encrypted.txt
new file mode 100644
index 00000000000..8fb79bc1ac4
--- /dev/null
+++ b/Documentation/security/keys-trusted-encrypted.txt
@@ -0,0 +1,145 @@
+ Trusted and Encrypted Keys
+
+Trusted and Encrypted Keys are two new key types added to the existing kernel
+key ring service. Both of these new types are variable length symmetric keys,
+and in both cases all keys are created in the kernel, and user space sees,
+stores, and loads only encrypted blobs. Trusted Keys require the availability
+of a Trusted Platform Module (TPM) chip for greater security, while Encrypted
+Keys can be used on any system. All user level blobs are displayed and loaded
+in hex ascii for convenience, and are integrity verified.
+
+Trusted Keys use a TPM both to generate and to seal the keys. Keys are sealed
+under a 2048 bit RSA key in the TPM, and optionally sealed to specified PCR
+(integrity measurement) values, and only unsealed by the TPM if PCRs and blob
+integrity verifications match. A loaded Trusted Key can be updated with new
+(future) PCR values, so keys are easily migrated to new PCR values, such as
+when the kernel and initramfs are updated. The same key can have many saved
+blobs under different PCR values, so multiple boots are easily supported.
+
+By default, trusted keys are sealed under the SRK, which has the default
+authorization value (20 zeros). This can be set at takeownership time with the
+TrouSerS utility: "tpm_takeownership -u -z".
+
+Usage:
+ keyctl add trusted name "new keylen [options]" ring
+ keyctl add trusted name "load hex_blob [pcrlock=pcrnum]" ring
+ keyctl update key "update [options]"
+ keyctl print keyid
+
+ options:
+ keyhandle= ascii hex value of sealing key default 0x40000000 (SRK)
+ keyauth= ascii hex auth for sealing key default 0x00...
+ (40 ascii zeros)
+ blobauth= ascii hex auth for sealed data default 0x00...
+ (40 ascii zeros)
+ pcrinfo= ascii hex of PCR_INFO or PCR_INFO_LONG (no default)
+ pcrlock= pcr number to be extended to "lock" blob
+ migratable= 0|1 indicating permission to reseal to new PCR values,
+ default 1 (resealing allowed)
+
+"keyctl print" returns an ascii hex copy of the sealed key, which is in standard
+TPM_STORED_DATA format. The key length for new keys is always in bytes.
+Trusted Keys can be 32 - 128 bytes (256 - 1024 bits); the upper limit is to fit
+within the 2048 bit SRK (RSA) keylength, with all necessary structure/padding.
+
+Encrypted keys do not depend on a TPM, and are faster, as they use AES for
+encryption/decryption. New keys are created from kernel generated random
+numbers, and are encrypted/decrypted using a specified 'master' key. The
+'master' key can either be a trusted-key or user-key type. The main
+disadvantage of encrypted keys is that if they are not rooted in a trusted key,
+they are only as secure as the user key encrypting them. The master user key
+should therefore be loaded in as secure a way as possible, preferably early in
+boot.
+ +Usage: + keyctl add encrypted name "new key-type:master-key-name keylen" ring + keyctl add encrypted name "load hex_blob" ring + keyctl update keyid "update key-type:master-key-name" + +where 'key-type' is either 'trusted' or 'user'. + +Examples of trusted and encrypted key usage: + +Create and save a trusted key named "kmk" of length 32 bytes: + + $ keyctl add trusted kmk "new 32" @u + 440502848 + + $ keyctl show + Session Keyring + -3 --alswrv 500 500 keyring: _ses + 97833714 --alswrv 500 -1 \_ keyring: _uid.500 + 440502848 --alswrv 500 500 \_ trusted: kmk + + $ keyctl print 440502848 + 0101000000000000000001005d01b7e3f4a6be5709930f3b70a743cbb42e0cc95e18e915 + 3f60da455bbf1144ad12e4f92b452f966929f6105fd29ca28e4d4d5a031d068478bacb0b + 27351119f822911b0a11ba3d3498ba6a32e50dac7f32894dd890eb9ad578e4e292c83722 + a52e56a097e6a68b3f56f7a52ece0cdccba1eb62cad7d817f6dc58898b3ac15f36026fec + d568bd4a706cb60bb37be6d8f1240661199d640b66fb0fe3b079f97f450b9ef9c22c6d5d + dd379f0facd1cd020281dfa3c70ba21a3fa6fc2471dc6d13ecf8298b946f65345faa5ef0 + f1f8fff03ad0acb083725535636addb08d73dedb9832da198081e5deae84bfaf0409c22b + e4a8aea2b607ec96931e6f4d4fe563ba + + $ keyctl pipe 440502848 > kmk.blob + +Load a trusted key from the saved blob: + + $ keyctl add trusted kmk "load `cat kmk.blob`" @u + 268728824 + + $ keyctl print 268728824 + 0101000000000000000001005d01b7e3f4a6be5709930f3b70a743cbb42e0cc95e18e915 + 3f60da455bbf1144ad12e4f92b452f966929f6105fd29ca28e4d4d5a031d068478bacb0b + 27351119f822911b0a11ba3d3498ba6a32e50dac7f32894dd890eb9ad578e4e292c83722 + a52e56a097e6a68b3f56f7a52ece0cdccba1eb62cad7d817f6dc58898b3ac15f36026fec + d568bd4a706cb60bb37be6d8f1240661199d640b66fb0fe3b079f97f450b9ef9c22c6d5d + dd379f0facd1cd020281dfa3c70ba21a3fa6fc2471dc6d13ecf8298b946f65345faa5ef0 + f1f8fff03ad0acb083725535636addb08d73dedb9832da198081e5deae84bfaf0409c22b + e4a8aea2b607ec96931e6f4d4fe563ba + +Reseal a trusted key under new pcr values: + + $ keyctl update 268728824 "update pcrinfo=`cat pcr.blob`" + $ keyctl print 268728824 + 010100000000002c0002800093c35a09b70fff26e7a98ae786c641e678ec6ffb6b46d805 + 77c8a6377aed9d3219c6dfec4b23ffe3000001005d37d472ac8a44023fbb3d18583a4f73 + d3a076c0858f6f1dcaa39ea0f119911ff03f5406df4f7f27f41da8d7194f45c9f4e00f2e + df449f266253aa3f52e55c53de147773e00f0f9aca86c64d94c95382265968c354c5eab4 + 9638c5ae99c89de1e0997242edfb0b501744e11ff9762dfd951cffd93227cc513384e7e6 + e782c29435c7ec2edafaa2f4c1fe6e7a781b59549ff5296371b42133777dcc5b8b971610 + 94bc67ede19e43ddb9dc2baacad374a36feaf0314d700af0a65c164b7082401740e489c9 + 7ef6a24defe4846104209bf0c3eced7fa1a672ed5b125fc9d8cd88b476a658a4434644ef + df8ae9a178e9f83ba9f08d10fa47e4226b98b0702f06b3b8 + +Create and save an encrypted key "evm" using the above trusted key "kmk": + + $ keyctl add encrypted evm "new trusted:kmk 32" @u + 159771175 + + $ keyctl print 159771175 + trusted:kmk 32 2375725ad57798846a9bbd240de8906f006e66c03af53b1b382dbbc55 + be2a44616e4959430436dc4f2a7a9659aa60bb4652aeb2120f149ed197c564e024717c64 + 5972dcb82ab2dde83376d82b2e3c09ffc + + $ keyctl pipe 159771175 > evm.blob + +Load an encrypted key "evm" from saved blob: + + $ keyctl add encrypted evm "load `cat evm.blob`" @u + 831684262 + + $ keyctl print 831684262 + trusted:kmk 32 2375725ad57798846a9bbd240de8906f006e66c03af53b1b382dbbc55 + be2a44616e4959430436dc4f2a7a9659aa60bb4652aeb2120f149ed197c564e024717c64 + 5972dcb82ab2dde83376d82b2e3c09ffc + + +The initial consumer of trusted keys is EVM, which at boot time needs a high +quality symmetric key for HMAC protection of file metadata. 
The use of a
+trusted key provides strong guarantees that the EVM key has not been
+compromised by a user level problem, and when sealed to specific boot PCR
+values, protects against boot and offline attacks. Other uses for trusted and
+encrypted keys, such as for disk and file encryption, are anticipated.
diff --git a/Documentation/security/keys.txt b/Documentation/security/keys.txt
new file mode 100644
index 00000000000..4d75931d2d7
--- /dev/null
+++ b/Documentation/security/keys.txt
@@ -0,0 +1,1290 @@
+ ============================
+ KERNEL KEY RETENTION SERVICE
+ ============================
+
+This service allows cryptographic keys, authentication tokens, cross-domain
+user mappings, and similar to be cached in the kernel for the use of
+filesystems and other kernel services.
+
+Keyrings are permitted; these are a special type of key that can hold links to
+other keys. Processes each have three standard keyring subscriptions that a
+kernel service can search for relevant keys.
+
+The key service can be configured on by enabling:
+
+ "Security options"/"Enable access key retention support" (CONFIG_KEYS)
+
+This document has the following sections:
+
+ - Key overview
+ - Key service overview
+ - Key access permissions
+ - SELinux support
+ - New procfs files
+ - Userspace system call interface
+ - Kernel services
+ - Notes on accessing payload contents
+ - Defining a key type
+ - Request-key callback service
+ - Garbage collection
+
+
+============
+KEY OVERVIEW
+============
+
+In this context, keys represent units of cryptographic data, authentication
+tokens, keyrings, etc. These are represented in the kernel by struct key.
+
+Each key has a number of attributes:
+
+ - A serial number.
+ - A type.
+ - A description (for matching a key in a search).
+ - Access control information.
+ - An expiry time.
+ - A payload.
+ - State.
+
+
+ (*) Each key is issued a serial number of type key_serial_t that is unique for
+ the lifetime of that key. All serial numbers are positive non-zero 32-bit
+ integers.
+
+ Userspace programs can use a key's serial number as a way to gain access
+ to it, subject to permission checking.
+
+ (*) Each key is of a defined "type". Types must be registered inside the
+ kernel by a kernel service (such as a filesystem) before keys of that type
+ can be added or used. Userspace programs cannot define new types directly.
+
+ Key types are represented in the kernel by struct key_type. This defines a
+ number of operations that can be performed on a key of that type.
+
+ Should a type be removed from the system, all the keys of that type will
+ be invalidated.
+
+ (*) Each key has a description. This should be a printable string. The key
+ type provides an operation to perform a match between the description on a
+ key and a criterion string.
+
+ (*) Each key has an owner user ID, a group ID and a permissions mask. These
+ are used to control what a process may do to a key from userspace, and
+ whether a kernel service will be able to find the key.
+
+ (*) Each key can be set to expire at a specific time by the key type's
+ instantiation function. Keys can also be immortal.
+
+ (*) Each key can have a payload. This is a quantity of data that represents the
+ actual "key". In the case of a keyring, this is a list of keys to which
+ the keyring links; in the case of a user-defined key, it's an arbitrary
+ blob of data.
+
+ Having a payload is not required; and the payload can, in fact, just be a
+ value stored in the struct key itself.
+ + When a key is instantiated, the key type's instantiation function is + called with a blob of data, and that then creates the key's payload in + some way. + + Similarly, when userspace wants to read back the contents of the key, if + permitted, another key type operation will be called to convert the key's + attached payload back into a blob of data. + + (*) Each key can be in one of a number of basic states: + + (*) Uninstantiated. The key exists, but does not have any data attached. + Keys being requested from userspace will be in this state. + + (*) Instantiated. This is the normal state. The key is fully formed, and + has data attached. + + (*) Negative. This is a relatively short-lived state. The key acts as a + note saying that a previous call out to userspace failed, and acts as + a throttle on key lookups. A negative key can be updated to a normal + state. + + (*) Expired. Keys can have lifetimes set. If their lifetime is exceeded, + they traverse to this state. An expired key can be updated back to a + normal state. + + (*) Revoked. A key is put in this state by userspace action. It can't be + found or operated upon (apart from by unlinking it). + + (*) Dead. The key's type was unregistered, and so the key is now useless. + +Keys in the last three states are subject to garbage collection. See the +section on "Garbage collection". + + +==================== +KEY SERVICE OVERVIEW +==================== + +The key service provides a number of features besides keys: + + (*) The key service defines two special key types: + + (+) "keyring" + + Keyrings are special keys that contain a list of other keys. Keyring + lists can be modified using various system calls. Keyrings should not + be given a payload when created. + + (+) "user" + + A key of this type has a description and a payload that are arbitrary + blobs of data. These can be created, updated and read by userspace, + and aren't intended for use by kernel services. + + (*) Each process subscribes to three keyrings: a thread-specific keyring, a + process-specific keyring, and a session-specific keyring. + + The thread-specific keyring is discarded from the child when any sort of + clone, fork, vfork or execve occurs. A new keyring is created only when + required. + + The process-specific keyring is replaced with an empty one in the child on + clone, fork, vfork unless CLONE_THREAD is supplied, in which case it is + shared. execve also discards the process's process keyring and creates a + new one. + + The session-specific keyring is persistent across clone, fork, vfork and + execve, even when the latter executes a set-UID or set-GID binary. A + process can, however, replace its current session keyring with a new one + by using PR_JOIN_SESSION_KEYRING. It is permitted to request an anonymous + new one, or to attempt to create or join one of a specific name. + + The ownership of the thread keyring changes when the real UID and GID of + the thread changes. + + (*) Each user ID resident in the system holds two special keyrings: a user + specific keyring and a default user session keyring. The default session + keyring is initialised with a link to the user-specific keyring. + + When a process changes its real UID, if it used to have no session key, it + will be subscribed to the default session key for the new UID. + + If a process attempts to access its session key when it doesn't have one, + it will be subscribed to the default for its current UID. + + (*) Each user has two quotas against which the keys they own are tracked. 
One
+     limits the total number of keys and keyrings, the other limits the
+     total amount of description and payload space that can be consumed.
+
+     The user can view information on this and other statistics through
+     procfs files. The root user may also alter the quota limits through
+     sysctl files (see the section "New procfs files").
+
+     Process-specific and thread-specific keyrings are not counted towards a
+     user's quota.
+
+     If a system call that modifies a key or keyring in some way would put
+     the user over quota, the operation is refused and error EDQUOT is
+     returned.
+
+ (*) There's a system call interface by which userspace programs can create
+     and manipulate keys and keyrings.
+
+ (*) There's a kernel interface by which services can register types and
+     search for keys.
+
+ (*) There's a way for a search done from the kernel to call back to
+     userspace to request a key that can't be found in a process's keyrings.
+
+ (*) An optional filesystem is available through which the key database can
+     be viewed and manipulated.
+
+
+======================
+KEY ACCESS PERMISSIONS
+======================
+
+Keys have an owner user ID, a group access ID, and a permissions mask. The
+mask has up to eight bits each for possessor, user, group and other access.
+Only six of each set of eight bits are defined. The permissions granted are:
+
+ (*) View
+
+     This permits a key or keyring's attributes to be viewed - including key
+     type and description.
+
+ (*) Read
+
+     This permits a key's payload to be viewed, or a keyring's list of
+     linked keys to be read.
+
+ (*) Write
+
+     This permits a key's payload to be instantiated or updated, or it
+     allows a link to be added to or removed from a keyring.
+
+ (*) Search
+
+     This permits keyrings to be searched and keys to be found. Searches can
+     only recurse into nested keyrings that have search permission set.
+
+ (*) Link
+
+     This permits a key or keyring to be linked to. To create a link from a
+     keyring to a key, a process must have Write permission on the keyring
+     and Link permission on the key.
+
+ (*) Set Attribute
+
+     This permits a key's UID, GID and permissions mask to be changed.
+
+For changing the ownership, group ID or permissions mask, being the owner of
+the key or having the sysadmin capability is sufficient.
+
+
+===============
+SELINUX SUPPORT
+===============
+
+The security class "key" has been added to SELinux so that mandatory access
+controls can be applied to keys created within various contexts. This
+support is preliminary, and is likely to change quite significantly in the
+near future. Currently, all of the basic permissions explained above are
+provided in SELinux as well; SELinux is simply invoked after all basic
+permission checks have been performed.
+
+The value of the file /proc/self/attr/keycreate influences the labeling of
+newly-created keys. If the contents of that file correspond to an SELinux
+security context, then the key will be assigned that context. Otherwise, the
+key will be assigned the current context of the task that invoked the key
+creation request. Tasks must be granted explicit permission to assign a
+particular context to newly-created keys, using the "create" permission in
+the key security class.
+
+The default keyrings associated with users will be labeled with the default
+context of the user if and only if the login programs have been instrumented
+to properly initialize keycreate during the login process. Otherwise, they
+will be labeled with the context of the login program itself.
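+
+As an illustrative sketch only (not part of the kernel interface; the
+context string below is hypothetical and error handling is abbreviated), a
+process might set the keycreate context before creating a key like this:
+
+	#include <stdio.h>
+	#include <string.h>
+
+	/* write the desired SELinux context for newly-created keys */
+	static int set_keycreate(const char *ctx)
+	{
+		FILE *f = fopen("/proc/self/attr/keycreate", "w");
+		if (!f)
+			return -1;
+		fwrite(ctx, strlen(ctx), 1, f);
+		return fclose(f);
+	}
+
+	/* e.g. set_keycreate("user_u:user_r:user_key_t:s0"); then create
+	 * keys as usual and they will be assigned that context */
+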
+
+Note, however, that the default keyrings associated with the root user are
+labeled with the default kernel context, since they are created early in the
+boot process, before root has a chance to log in.
+
+The keyrings associated with new threads are each labeled with the context
+of their associated thread, and both session and process keyrings are
+handled similarly.
+
+
+================
+NEW PROCFS FILES
+================
+
+Two files have been added to procfs by which an administrator can find out
+about the status of the key service:
+
+ (*) /proc/keys
+
+     This lists the keys that are currently viewable by the task reading the
+     file, giving information about their type, description and permissions.
+     It is not possible to view the payload of the key this way, though some
+     information about it may be given.
+
+     The only keys included in the list are those that grant View permission
+     to the reading process, whether or not it possesses them. Note that LSM
+     security checks are still performed, and may further filter out keys
+     that the current process is not authorised to view.
+
+     The contents of the file look like this:
+
+	SERIAL   FLAGS  USAGE EXPY PERM     UID   GID   TYPE      DESCRIPTION: SUMMARY
+	00000001 I-----    39 perm 1f3f0000     0     0 keyring   _uid_ses.0: 1/4
+	00000002 I-----     2 perm 1f3f0000     0     0 keyring   _uid.0: empty
+	00000007 I-----     1 perm 1f3f0000     0     0 keyring   _pid.1: empty
+	0000018d I-----     1 perm 1f3f0000     0     0 keyring   _pid.412: empty
+	000004d2 I--Q--     1 perm 1f3f0000    32    -1 keyring   _uid.32: 1/4
+	000004d3 I--Q--     3 perm 1f3f0000    32    -1 keyring   _uid_ses.32: empty
+	00000892 I--QU-     1 perm 1f000000     0     0 user      metal:copper: 0
+	00000893 I--Q-N     1  35s 1f3f0000     0     0 user      metal:silver: 0
+	00000894 I--Q--     1  10h 003f0000     0     0 user      metal:gold: 0
+
+     The flags are:
+
+	I	Instantiated
+	R	Revoked
+	D	Dead
+	Q	Contributes to user's quota
+	U	Under construction by callback to userspace
+	N	Negative key
+
+     This file must be enabled at kernel configuration time as it allows
+     anyone to list the keys database.
+
+ (*) /proc/key-users
+
+     This file lists the tracking data for each user that has at least one
+     key on the system. Such data includes quota information and statistics:
+
+	[root@andromeda root]# cat /proc/key-users
+	0:     46 45/45 1/100 13/10000
+	29:     2 2/2 2/100 40/10000
+	32:     2 2/2 2/100 40/10000
+	38:     2 2/2 2/100 40/10000
+
+     The format of each line is:
+
+	<UID>:			User ID to which this applies
+	<usage>			Structure refcount
+	<inst>/<keys>		Total number of keys and number instantiated
+	<keys>/<max>		Key count quota
+	<bytes>/<max>		Key size quota
+
+
+Four new sysctl files have also been added for the purpose of controlling
+the quota limits on keys:
+
+ (*) /proc/sys/kernel/keys/root_maxkeys
+     /proc/sys/kernel/keys/root_maxbytes
+
+     These files hold the maximum number of keys that root may have and the
+     maximum total number of bytes of data that root may have stored in
+     those keys.
+
+ (*) /proc/sys/kernel/keys/maxkeys
+     /proc/sys/kernel/keys/maxbytes
+
+     These files hold the maximum number of keys that each non-root user may
+     have and the maximum total number of bytes of data that each of those
+     users may have stored in their keys.
+
+Root may alter these by writing each new limit as a decimal number string to
+the appropriate file.
+
+
+===============================
+USERSPACE SYSTEM CALL INTERFACE
+===============================
+
+Userspace can manipulate keys directly through three new syscalls: add_key,
+request_key and keyctl. The latter provides a number of functions for
+manipulating keys.
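+
+For illustration only: a minimal userspace sketch, assuming the libkeyutils
+wrappers add_key() and keyctl_read_alloc() from <keyutils.h> are available
+(they are not part of the kernel itself). It creates a "user" key in the
+session keyring and reads the payload back:
+
+	#include <stdio.h>
+	#include <stdlib.h>
+	#include <keyutils.h>
+
+	int main(void)
+	{
+		void *buf;
+		long len;
+		key_serial_t key;
+
+		/* add (or update) a user-defined key in the session keyring */
+		key = add_key("user", "example:demo", "secret", 6,
+			      KEY_SPEC_SESSION_KEYRING);
+		if (key < 0) {
+			perror("add_key");
+			return 1;
+		}
+
+		/* read the payload back; keyctl_read_alloc() allocates buf */
+		len = keyctl_read_alloc(key, &buf);
+		if (len < 0) {
+			perror("keyctl_read_alloc");
+			return 1;
+		}
+		printf("key %d holds %ld bytes\n", key, len);
+		free(buf);
+		return 0;
+	}
+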
+
+When referring to a key directly, userspace programs should use the key's
+serial number (a positive 32-bit integer). However, there are some special
+values available for referring to special keys and keyrings that relate to
+the process making the call:
+
+	CONSTANT			VALUE	KEY REFERENCED
+	==============================	======	===========================
+	KEY_SPEC_THREAD_KEYRING		-1	thread-specific keyring
+	KEY_SPEC_PROCESS_KEYRING	-2	process-specific keyring
+	KEY_SPEC_SESSION_KEYRING	-3	session-specific keyring
+	KEY_SPEC_USER_KEYRING		-4	UID-specific keyring
+	KEY_SPEC_USER_SESSION_KEYRING	-5	UID-session keyring
+	KEY_SPEC_GROUP_KEYRING		-6	GID-specific keyring
+	KEY_SPEC_REQKEY_AUTH_KEY	-7	assumed request_key()
+						  authorisation key
+
+
+The main syscalls are:
+
+ (*) Create a new key of given type, description and payload and add it to
+     the nominated keyring:
+
+	key_serial_t add_key(const char *type, const char *desc,
+			     const void *payload, size_t plen,
+			     key_serial_t keyring);
+
+     If a key of the same type and description as that proposed already
+     exists in the keyring, this will try to update it with the given
+     payload, or it will return error EEXIST if that function is not
+     supported by the key type. The process must also have permission to
+     write to the key to be able to update it. The new key will have all
+     user permissions granted and no group or third party permissions.
+
+     Otherwise, this will attempt to create a new key of the specified type
+     and description, and to instantiate it with the supplied payload and
+     attach it to the keyring. In this case, an error will be generated if
+     the process does not have permission to write to the keyring.
+
+     The payload is optional, and the pointer can be NULL if not required by
+     the type. The payload is plen in size, and plen can be zero for an
+     empty payload.
+
+     A new keyring can be generated by setting type "keyring", the keyring
+     name as the description (or NULL) and setting the payload to NULL.
+
+     User defined keys can be created by specifying type "user". It is
+     recommended that a user defined key's description be prefixed with a
+     type ID and a colon, such as "krb5tgt:" for a Kerberos 5 ticket
+     granting ticket.
+
+     Any other type must have been registered with the kernel in advance by
+     a kernel service such as a filesystem.
+
+     The ID of the new or updated key is returned if successful.
+
+
+ (*) Search the process's keyrings for a key, potentially calling out to
+     userspace to create it.
+
+	key_serial_t request_key(const char *type, const char *description,
+				 const char *callout_info,
+				 key_serial_t dest_keyring);
+
+     This function searches all the process's keyrings in the order thread,
+     process, session for a matching key. This works very much like
+     KEYCTL_SEARCH, including the optional attachment of the discovered key
+     to a keyring.
+
+     If a key cannot be found, and if callout_info is not NULL, then
+     /sbin/request-key will be invoked in an attempt to obtain a key. The
+     callout_info string will be passed as an argument to the program.
+
+     See also Documentation/security/keys-request-key.txt.
+
+
+The keyctl syscall functions are:
+
+ (*) Map a special key ID to a real key ID for this process:
+
+	key_serial_t keyctl(KEYCTL_GET_KEYRING_ID, key_serial_t id,
+			    int create);
+
+     The special key specified by "id" is looked up (with the key being
+     created if necessary) and the ID of the key or keyring thus found is
+     returned if it exists.
+
+     If the key does not yet exist, the key will be created if "create" is
+     non-zero; and the error ENOKEY will be returned if "create" is zero.
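+
+     As a purely illustrative sketch (assuming the libkeyutils wrapper
+     keyctl_get_keyring_ID() from <keyutils.h> rather than a raw syscall),
+     resolving the session keyring's real serial number might look like:
+
+	#include <keyutils.h>
+
+	/* resolve KEY_SPEC_SESSION_KEYRING to its real key ID, creating
+	 * the keyring if it does not exist yet (create = 1) */
+	key_serial_t id = keyctl_get_keyring_ID(KEY_SPEC_SESSION_KEYRING, 1);
+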
+
+
+ (*) Replace the session keyring this process subscribes to with a new one:
+
+	key_serial_t keyctl(KEYCTL_JOIN_SESSION_KEYRING, const char *name);
+
+     If name is NULL, an anonymous keyring is created attached to the
+     process as its session keyring, displacing the old session keyring.
+
+     If name is not NULL, if a keyring of that name exists, the process
+     attempts to attach it as the session keyring, returning an error if
+     that is not permitted; otherwise a new keyring of that name is created
+     and attached as the session keyring.
+
+     To attach to a named keyring, the keyring must have search permission
+     for the process's ownership.
+
+     The ID of the new session keyring is returned if successful.
+
+
+ (*) Update the specified key:
+
+	long keyctl(KEYCTL_UPDATE, key_serial_t key, const void *payload,
+		    size_t plen);
+
+     This will try to update the specified key with the given payload, or it
+     will return error EOPNOTSUPP if that function is not supported by the
+     key type. The process must also have permission to write to the key to
+     be able to update it.
+
+     The payload is of length plen, and may be absent or empty as for
+     add_key().
+
+
+ (*) Revoke a key:
+
+	long keyctl(KEYCTL_REVOKE, key_serial_t key);
+
+     This makes a key unavailable for further operations. Further attempts
+     to use the key will be met with error EKEYREVOKED, and the key will no
+     longer be findable.
+
+
+ (*) Change the ownership of a key:
+
+	long keyctl(KEYCTL_CHOWN, key_serial_t key, uid_t uid, gid_t gid);
+
+     This function permits a key's owner and group ID to be changed. Either
+     one of uid or gid can be set to -1 to suppress that change.
+
+     Only the superuser can change a key's owner to something other than the
+     key's current owner. Similarly, only the superuser can change a key's
+     group ID to something other than the calling process's group ID or one
+     of its group list members.
+
+
+ (*) Change the permissions mask on a key:
+
+	long keyctl(KEYCTL_SETPERM, key_serial_t key, key_perm_t perm);
+
+     This function permits the owner of a key or the superuser to change the
+     permissions mask on a key.
+
+     Only the available bits are permitted; if any other bits are set, error
+     EINVAL will be returned.
+
+
+ (*) Describe a key:
+
+	long keyctl(KEYCTL_DESCRIBE, key_serial_t key, char *buffer,
+		    size_t buflen);
+
+     This function returns a summary of the key's attributes (but not its
+     payload data) as a string in the buffer provided.
+
+     Unless there's an error, it always returns the amount of data it could
+     produce, even if that's too big for the buffer, but it won't copy more
+     than requested to userspace. If the buffer pointer is NULL then no copy
+     will take place.
+
+     A process must have view permission on the key for this function to be
+     successful.
+
+     If successful, a string is placed in the buffer in the following
+     format:
+
+	<type>;<uid>;<gid>;<perm>;<description>
+
+     Where type and description are strings, uid and gid are decimal, and
+     perm is hexadecimal. A NUL character is included at the end of the
+     string if the buffer is sufficiently big.
+
+     This can be parsed with
+
+	sscanf(buffer, "%[^;];%d;%d;%o;%s", type, &uid, &gid, &mode, desc);
+
+
+ (*) Clear out a keyring:
+
+	long keyctl(KEYCTL_CLEAR, key_serial_t keyring);
+
+     This function clears the list of keys attached to a keyring.
The calling + process must have write permission on the keyring, and it must be a + keyring (or else error ENOTDIR will result). + + + (*) Link a key into a keyring: + + long keyctl(KEYCTL_LINK, key_serial_t keyring, key_serial_t key); + + This function creates a link from the keyring to the key. The process must + have write permission on the keyring and must have link permission on the + key. + + Should the keyring not be a keyring, error ENOTDIR will result; and if the + keyring is full, error ENFILE will result. + + The link procedure checks the nesting of the keyrings, returning ELOOP if + it appears too deep or EDEADLK if the link would introduce a cycle. + + Any links within the keyring to keys that match the new key in terms of + type and description will be discarded from the keyring as the new one is + added. + + + (*) Unlink a key or keyring from another keyring: + + long keyctl(KEYCTL_UNLINK, key_serial_t keyring, key_serial_t key); + + This function looks through the keyring for the first link to the + specified key, and removes it if found. Subsequent links to that key are + ignored. The process must have write permission on the keyring. + + If the keyring is not a keyring, error ENOTDIR will result; and if the key + is not present, error ENOENT will be the result. + + + (*) Search a keyring tree for a key: + + key_serial_t keyctl(KEYCTL_SEARCH, key_serial_t keyring, + const char *type, const char *description, + key_serial_t dest_keyring); + + This searches the keyring tree headed by the specified keyring until a key + is found that matches the type and description criteria. Each keyring is + checked for keys before recursion into its children occurs. + + The process must have search permission on the top level keyring, or else + error EACCES will result. Only keyrings that the process has search + permission on will be recursed into, and only keys and keyrings for which + a process has search permission can be matched. If the specified keyring + is not a keyring, ENOTDIR will result. + + If the search succeeds, the function will attempt to link the found key + into the destination keyring if one is supplied (non-zero ID). All the + constraints applicable to KEYCTL_LINK apply in this case too. + + Error ENOKEY, EKEYREVOKED or EKEYEXPIRED will be returned if the search + fails. On success, the resulting key ID will be returned. + + + (*) Read the payload data from a key: + + long keyctl(KEYCTL_READ, key_serial_t keyring, char *buffer, + size_t buflen); + + This function attempts to read the payload data from the specified key + into the buffer. The process must have read permission on the key to + succeed. + + The returned data will be processed for presentation by the key type. For + instance, a keyring will return an array of key_serial_t entries + representing the IDs of all the keys to which it is subscribed. The user + defined key type will return its data as is. If a key type does not + implement this function, error EOPNOTSUPP will result. + + As much of the data as can be fitted into the buffer will be copied to + userspace if the buffer pointer is not NULL. + + On a successful return, the function will always return the amount of data + available rather than the amount copied. + + + (*) Instantiate a partially constructed key. 
+
+	long keyctl(KEYCTL_INSTANTIATE, key_serial_t key,
+		    const void *payload, size_t plen,
+		    key_serial_t keyring);
+	long keyctl(KEYCTL_INSTANTIATE_IOV, key_serial_t key,
+		    const struct iovec *payload_iov, unsigned ioc,
+		    key_serial_t keyring);
+
+     If the kernel calls back to userspace to complete the instantiation of
+     a key, userspace should use this call to supply data for the key before
+     the invoked process returns, or else the key will be marked negative
+     automatically.
+
+     The process must have write access on the key to be able to instantiate
+     it, and the key must be uninstantiated.
+
+     If a keyring is specified (non-zero), the key will also be linked into
+     that keyring; however, all the constraints applicable to KEYCTL_LINK
+     apply in this case too.
+
+     The payload and plen arguments describe the payload data as for
+     add_key().
+
+     The payload_iov and ioc arguments describe the payload data in an iovec
+     array instead of a single buffer.
+
+
+ (*) Negatively instantiate a partially constructed key.
+
+	long keyctl(KEYCTL_NEGATE, key_serial_t key,
+		    unsigned timeout, key_serial_t keyring);
+	long keyctl(KEYCTL_REJECT, key_serial_t key,
+		    unsigned timeout, unsigned error, key_serial_t keyring);
+
+     If the kernel calls back to userspace to complete the instantiation of
+     a key, userspace should use this call to mark the key as negative
+     before the invoked process returns if it is unable to fulfil the
+     request.
+
+     The process must have write access on the key to be able to instantiate
+     it, and the key must be uninstantiated.
+
+     If a keyring is specified (non-zero), the key will also be linked into
+     that keyring; however, all the constraints applicable to KEYCTL_LINK
+     apply in this case too.
+
+     If the key is rejected, future searches for it will return the
+     specified error code until the rejected key expires. Negating the key
+     is the same as rejecting the key with ENOKEY as the error code.
+
+
+ (*) Set the default request-key destination keyring.
+
+	long keyctl(KEYCTL_SET_REQKEY_KEYRING, int reqkey_defl);
+
+     This sets the default keyring to which implicitly requested keys will
+     be attached for this thread. reqkey_defl should be one of these
+     constants:
+
+	CONSTANT				VALUE	NEW DEFAULT KEYRING
+	======================================	======	=======================
+	KEY_REQKEY_DEFL_NO_CHANGE		-1	No change
+	KEY_REQKEY_DEFL_DEFAULT			0	Default[1]
+	KEY_REQKEY_DEFL_THREAD_KEYRING		1	Thread keyring
+	KEY_REQKEY_DEFL_PROCESS_KEYRING		2	Process keyring
+	KEY_REQKEY_DEFL_SESSION_KEYRING		3	Session keyring
+	KEY_REQKEY_DEFL_USER_KEYRING		4	User keyring
+	KEY_REQKEY_DEFL_USER_SESSION_KEYRING	5	User session keyring
+	KEY_REQKEY_DEFL_GROUP_KEYRING		6	Group keyring
+
+     The old default will be returned if successful and error EINVAL will be
+     returned if reqkey_defl is not one of the above values.
+
+     The default keyring can be overridden by the keyring indicated to the
+     request_key() system call.
+
+     Note that this setting is inherited across fork/exec.
+
+     [1] The default is: the thread keyring if there is one, otherwise the
+     process keyring if there is one, otherwise the session keyring if there
+     is one, otherwise the user default session keyring.
+
+
+ (*) Set the timeout on a key.
+
+	long keyctl(KEYCTL_SET_TIMEOUT, key_serial_t key, unsigned timeout);
+
+     This sets or clears the timeout on a key. The timeout can be 0 to clear
+     the timeout or a number of seconds to set the expiry time that far into
+     the future.
+
+     The process must have attribute modification access on a key to set its
+     timeout. Timeouts may not be set with this function on negative,
+     revoked or expired keys.
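+
+     An illustrative sketch only (assuming the libkeyutils wrapper
+     keyctl_set_timeout() from <keyutils.h>, and that "key" already holds a
+     valid serial number):
+
+	#include <keyutils.h>
+
+	/* expire the key in five minutes; 0 would clear the timeout */
+	long ret = keyctl_set_timeout(key, 5 * 60);
+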
+
+
+ (*) Assume the authority granted to instantiate a key
+
+	long keyctl(KEYCTL_ASSUME_AUTHORITY, key_serial_t key);
+
+     This assumes or divests the authority required to instantiate the
+     specified key. Authority can only be assumed if the thread has the
+     authorisation key associated with the specified key in its keyrings
+     somewhere.
+
+     Once authority is assumed, searches for keys will also search the
+     requester's keyrings using the requester's security label, UID, GID and
+     groups.
+
+     If the requested authority is unavailable, error EPERM will be
+     returned, likewise if the authority has been revoked because the target
+     key is already instantiated.
+
+     If the specified key is 0, then any assumed authority will be divested.
+
+     The assumed authoritative key is inherited across fork and exec.
+
+
+ (*) Get the LSM security context attached to a key.
+
+	long keyctl(KEYCTL_GET_SECURITY, key_serial_t key, char *buffer,
+		    size_t buflen)
+
+     This function returns a string that represents the LSM security context
+     attached to a key in the buffer provided.
+
+     Unless there's an error, it always returns the amount of data it could
+     produce, even if that's too big for the buffer, but it won't copy more
+     than requested to userspace. If the buffer pointer is NULL then no copy
+     will take place.
+
+     A NUL character is included at the end of the string if the buffer is
+     sufficiently big. This is included in the returned count. If no LSM is
+     in force then an empty string will be returned.
+
+     A process must have view permission on the key for this function to be
+     successful.
+
+
+ (*) Install the calling process's session keyring on its parent.
+
+	long keyctl(KEYCTL_SESSION_TO_PARENT);
+
+     This function attempts to install the calling process's session keyring
+     on to the calling process's parent, replacing the parent's current
+     session keyring.
+
+     The calling process must have the same ownership as its parent, the
+     keyring must have the same ownership as the calling process, the
+     calling process must have LINK permission on the keyring and the active
+     LSM module mustn't deny permission, otherwise error EPERM will be
+     returned.
+
+     Error ENOMEM will be returned if there was insufficient memory to
+     complete the operation, otherwise 0 will be returned to indicate
+     success.
+
+     The keyring will be replaced next time the parent process leaves the
+     kernel and resumes executing userspace.
+
+
+===============
+KERNEL SERVICES
+===============
+
+The kernel services for key management are fairly simple to deal with. They
+can be broken down into two areas: keys and key types.
+
+Dealing with keys is fairly straightforward. Firstly, the kernel service
+registers its type, then it searches for a key of that type. It should
+retain the key as long as it has need of it, and then it should release it.
+For a filesystem or device file, a search would probably be performed during
+the open call, and the key released upon close. How to deal with conflicting
+keys due to two different users opening the same file is left to the
+filesystem author to solve.
+
+To access the key manager, the following header must be #included:
+
+	<linux/key.h>
+
+Specific key types should have a header file under include/keys/ that should
+be used to access that type.
For keys of type "user", for example, that would be:
+
+	<keys/user-type.h>
+
+Note that there are two different types of pointers to keys that may be
+encountered:
+
+ (*) struct key *
+
+     This simply points to the key structure itself. Key structures will be
+     at least four-byte aligned.
+
+ (*) key_ref_t
+
+     This is equivalent to a struct key *, but the least significant bit is
+     set if the caller "possesses" the key. By "possession" it is meant that
+     the calling process has a searchable link to the key from one of its
+     keyrings. There are three functions for dealing with these:
+
+	key_ref_t make_key_ref(const struct key *key,
+			       unsigned long possession);
+
+	struct key *key_ref_to_ptr(const key_ref_t key_ref);
+
+	unsigned long is_key_possessed(const key_ref_t key_ref);
+
+     The first function constructs a key reference from a key pointer and
+     possession information (which must be 0 or 1 and not any other value).
+
+     The second function retrieves the key pointer from a reference and the
+     third retrieves the possession flag.
+
+When accessing a key's payload contents, certain precautions must be taken
+to prevent access vs modification races. See the section "Notes on accessing
+payload contents" for more information.
+
+(*) To search for a key, call:
+
+	struct key *request_key(const struct key_type *type,
+				const char *description,
+				const char *callout_info);
+
+    This is used to request a key or keyring with a description that matches
+    the description specified according to the key type's match function.
+    This permits approximate matching to occur. If callout_info is not NULL,
+    then /sbin/request-key will be invoked in an attempt to obtain the key
+    from userspace. In that case, callout_info will be passed as an argument
+    to the program.
+
+    Should the function fail, error ENOKEY, EKEYEXPIRED or EKEYREVOKED will
+    be returned.
+
+    If successful, the key will have been attached to the default keyring
+    for implicitly obtained request-key keys, as set by
+    KEYCTL_SET_REQKEY_KEYRING.
+
+    See also Documentation/security/keys-request-key.txt.
+
+
+(*) To search for a key, passing auxiliary data to the upcaller, call:
+
+	struct key *request_key_with_auxdata(const struct key_type *type,
+					     const char *description,
+					     const void *callout_info,
+					     size_t callout_len,
+					     void *aux);
+
+    This is identical to request_key(), except that the auxiliary data is
+    passed to the key_type->request_key() op if it exists, and the
+    callout_info is a blob of length callout_len, if given (the length may
+    be 0).
+
+
+(*) A key can be requested asynchronously by calling one of:
+
+	struct key *request_key_async(const struct key_type *type,
+				      const char *description,
+				      const void *callout_info,
+				      size_t callout_len);
+
+    or:
+
+	struct key *request_key_async_with_auxdata(const struct key_type *type,
+						   const char *description,
+						   const char *callout_info,
+						   size_t callout_len,
+						   void *aux);
+
+    which are asynchronous equivalents of request_key() and
+    request_key_with_auxdata() respectively.
+
+    These two functions return with the key potentially still under
+    construction. To wait for construction completion, the following should
+    be called:
+
+	int wait_for_key_construction(struct key *key, bool intr);
+
+    The function will wait for the key to finish being constructed and then
+    invoke key_validate() to return an appropriate value to indicate the
+    state of the key (0 indicates the key is usable).
+
+    If intr is true, then the wait can be interrupted by a signal, in which
+    case error ERESTARTSYS will be returned.
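+
+    For illustration, a rough kernel-side sketch combining the two calls
+    (the description string is hypothetical and error handling is
+    abbreviated):
+
+	struct key *key;
+	int ret;
+
+	/* kick off construction without blocking */
+	key = request_key_async(&key_type_user, "example:demo", NULL, 0);
+	if (IS_ERR(key))
+		return PTR_ERR(key);
+
+	/* later: block (interruptibly) until the key is usable */
+	ret = wait_for_key_construction(key, true);
+	if (ret < 0) {
+		key_put(key);
+		return ret;
+	}
+	/* ... use the key, then release it with key_put() ... */
+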
+
+
+(*) When it is no longer required, the key should be released using:
+
+	void key_put(struct key *key);
+
+    Or:
+
+	void key_ref_put(key_ref_t key_ref);
+
+    These can be called from interrupt context. If CONFIG_KEYS is not set
+    then the argument will not be parsed.
+
+
+(*) Extra references can be made to a key by calling the following function:
+
+	struct key *key_get(struct key *key);
+
+    These need to be disposed of by calling key_put() when they've been
+    finished with. The key pointer passed in will be returned. If the
+    pointer is NULL or CONFIG_KEYS is not set then the key will not be
+    dereferenced and no increment will take place.
+
+
+(*) A key's serial number can be obtained by calling:
+
+	key_serial_t key_serial(struct key *key);
+
+    If key is NULL or if CONFIG_KEYS is not set then 0 will be returned (in
+    the latter case without parsing the argument).
+
+
+(*) If a keyring was found in the search, this can be further searched by:
+
+	key_ref_t keyring_search(key_ref_t keyring_ref,
+				 const struct key_type *type,
+				 const char *description)
+
+    This searches the keyring tree specified for a matching key. Error
+    ENOKEY is returned upon failure (use IS_ERR/PTR_ERR to determine). If
+    successful, the returned key will need to be released.
+
+    The possession attribute from the keyring reference is used to control
+    access through the permissions mask and is propagated to the returned
+    key reference pointer if successful.
+
+
+(*) To check the validity of a key, this function can be called:
+
+	int validate_key(struct key *key);
+
+    This checks that the key in question hasn't expired and hasn't been
+    revoked. Should the key be invalid, error EKEYEXPIRED or EKEYREVOKED
+    will be returned. If the key is NULL or if CONFIG_KEYS is not set then 0
+    will be returned (in the latter case without parsing the argument).
+
+
+(*) To register a key type, the following function should be called:
+
+	int register_key_type(struct key_type *type);
+
+    This will return error EEXIST if a type of the same name is already
+    present.
+
+
+(*) To unregister a key type, call:
+
+	void unregister_key_type(struct key_type *type);
+
+
+Under some circumstances, it may be desirable to deal with a bundle of keys.
+The facility provides access to the keyring type for managing such a bundle:
+
+	struct key_type key_type_keyring;
+
+This can be used with a function such as request_key() to find a specific
+keyring in a process's keyrings. A keyring thus found can then be searched
+with keyring_search(). Note that it is not possible to use request_key() to
+search a specific keyring, so using keyrings in this way is of limited
+utility.
+
+
+===================================
+NOTES ON ACCESSING PAYLOAD CONTENTS
+===================================
+
+The simplest payload is just a number in key->payload.value. In this case,
+there's no need to indulge in RCU or locking when accessing the payload.
+
+More complex payload contents must be allocated and a pointer to them set in
+key->payload.data. One of the following ways must be selected to access the
+data:
+
+ (1) Unmodifiable key type.
+
+     If the key type does not have a modify method, then the key's payload
+     can be accessed without any form of locking, provided that it's known
+     to be instantiated (uninstantiated keys cannot be "found").
+
+ (2) The key's semaphore.
+
+     The semaphore could be used to govern access to the payload and to
+     control the payload pointer.
It must be write-locked for modifications
+     and would have to be read-locked for general access. The disadvantage
+     of doing this is that the accessor may be required to sleep.
+
+ (3) RCU.
+
+     RCU must be used when the semaphore isn't already held; if the
+     semaphore is held then the contents can't change under you unexpectedly
+     as the semaphore must still be used to serialise modifications to the
+     key. The key management code takes care of this for the key type.
+
+     However, this means using:
+
+	rcu_read_lock() ... rcu_dereference() ... rcu_read_unlock()
+
+     to read the pointer, and:
+
+	rcu_dereference() ... rcu_assign_pointer() ... call_rcu()
+
+     to set the pointer and dispose of the old contents after a grace
+     period. Note that only the key type should ever modify a key's payload.
+
+     Furthermore, an RCU controlled payload must hold a struct rcu_head for
+     the use of call_rcu() and, if the payload is of variable size, the
+     length of the payload. key->datalen cannot be relied upon to be
+     consistent with the payload just dereferenced if the key's semaphore is
+     not held.
+
+
+===================
+DEFINING A KEY TYPE
+===================
+
+A kernel service may want to define its own key type. For instance, an AFS
+filesystem might want to define a Kerberos 5 ticket key type. To do this,
+its author fills in a key_type struct and registers it with the system.
+
+Source files that implement key types should include the following header
+file:
+
+	<linux/key-type.h>
+
+The structure has a number of fields, some of which are mandatory:
+
+ (*) const char *name
+
+     The name of the key type. This is used to translate a key type name
+     supplied by userspace into a pointer to the structure.
+
+
+ (*) size_t def_datalen
+
+     This is optional - it supplies the default payload data length as
+     contributed to the quota. If the key type's payload is always or almost
+     always the same size, then this is a more efficient way to do things.
+
+     The data length (and quota) on a particular key can always be changed
+     during instantiation or update by calling:
+
+	int key_payload_reserve(struct key *key, size_t datalen);
+
+     with the revised data length. Error EDQUOT will be returned if this is
+     not viable.
+
+
+ (*) int (*vet_description)(const char *description);
+
+     This optional method is called to vet a key description. If the key
+     type doesn't approve of the key description, it may return an error,
+     otherwise it should return 0.
+
+
+ (*) int (*instantiate)(struct key *key, const void *data, size_t datalen);
+
+     This method is called to attach a payload to a key during construction.
+     The payload attached need not bear any relation to the data passed to
+     this function.
+
+     If the amount of data attached to the key differs from the size in
+     keytype->def_datalen, then key_payload_reserve() should be called.
+
+     This method does not have to lock the key in order to attach a payload.
+     The fact that KEY_FLAG_INSTANTIATED is not set in key->flags prevents
+     anything else from gaining access to the key.
+
+     It is safe to sleep in this method.
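+
+     For illustration, a minimal instantiate method for a hypothetical "foo"
+     key type might look like this sketch (assumes <linux/slab.h> and
+     <linux/key-type.h> are included):
+
+	static int foo_instantiate(struct key *key, const void *data,
+				   size_t datalen)
+	{
+		void *payload;
+		int ret;
+
+		/* charge the quota before committing to the new payload */
+		ret = key_payload_reserve(key, datalen);
+		if (ret < 0)
+			return ret;
+
+		payload = kmemdup(data, datalen, GFP_KERNEL);
+		if (!payload)
+			return -ENOMEM;
+
+		/* no locking needed: the key is not yet findable */
+		key->payload.data = payload;
+		return 0;
+	}
+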
+
+
+ (*) int (*update)(struct key *key, const void *data, size_t datalen);
+
+     If this type of key can be updated, then this method should be
+     provided. It is called to update a key's payload from the blob of data
+     provided.
+
+     key_payload_reserve() should be called if the data length might change
+     before any changes are actually made. Note that if this succeeds, the
+     type is committed to changing the key because it's already been
+     altered, so all memory allocation must be done first.
+
+     The key will have its semaphore write-locked before this method is
+     called, but this only deters other writers; any changes to the key's
+     payload must be made under RCU conditions, and call_rcu() must be used
+     to dispose of the old payload.
+
+     key_payload_reserve() should be called before the changes are made, but
+     after all allocations and other potentially failing function calls are
+     made.
+
+     It is safe to sleep in this method.
+
+
+ (*) int (*match)(const struct key *key, const void *desc);
+
+     This method is called to match a key against a description. It should
+     return non-zero if the two match, zero if they don't.
+
+     This method should not need to lock the key in any way. The type and
+     description can be considered invariant, and the payload should not be
+     accessed (the key may not yet be instantiated).
+
+     It is not safe to sleep in this method; the caller may hold spinlocks.
+
+
+ (*) void (*revoke)(struct key *key);
+
+     This method is optional. It is called to discard part of the payload
+     data upon a key being revoked. The caller will have the key semaphore
+     write-locked.
+
+     It is safe to sleep in this method, though care should be taken to
+     avoid a deadlock against the key semaphore.
+
+
+ (*) void (*destroy)(struct key *key);
+
+     This method is optional. It is called to discard the payload data on a
+     key when it is being destroyed.
+
+     This method does not need to lock the key to access the payload; it can
+     consider the key as being inaccessible at this time. Note that the
+     key's type may have been changed before this function is called.
+
+     It is not safe to sleep in this method; the caller may hold spinlocks.
+
+
+ (*) void (*describe)(const struct key *key, struct seq_file *p);
+
+     This method is optional. It is called during /proc/keys reading to
+     summarise a key's description and payload in text form.
+
+     This method will be called with the RCU read lock held.
+     rcu_dereference() should be used to read the payload pointer if the
+     payload is to be accessed. key->datalen cannot be trusted to stay
+     consistent with the contents of the payload.
+
+     The description will not change, though the key's state may.
+
+     It is not safe to sleep in this method; the RCU read lock is held by
+     the caller.
+
+
+ (*) long (*read)(const struct key *key, char __user *buffer, size_t buflen);
+
+     This method is optional. It is called by KEYCTL_READ to translate the
+     key's payload into a blob of data for userspace to deal with. Ideally,
+     the blob should be in the same format as that passed in to the
+     instantiate and update methods.
+
+     If successful, the blob size that could be produced should be returned
+     rather than the size copied.
+
+     This method will be called with the key's semaphore read-locked. This
+     will prevent the key's payload changing. It is not necessary to use RCU
+     locking when accessing the key's payload. It is safe to sleep in this
+     method, such as might happen when the userspace buffer is accessed.
+
+
+ (*) int (*request_key)(struct key_construction *cons, const char *op,
+			void *aux);
+
+     This method is optional. If provided, request_key() and friends will
+     invoke this function rather than upcalling to /sbin/request-key to
+     operate upon a key of this type.
+
+     The aux parameter is as passed to request_key_async_with_auxdata() and
+     similar, or is NULL otherwise. Also passed are the construction record
+     for the key to be operated upon and the operation type (currently only
+     "create").
+
+     This method is permitted to return before the upcall is complete, but
+     the following function must be called under all circumstances to
+     complete the instantiation process, whether it succeeds or fails:
+
+	void complete_request_key(struct key_construction *cons, int error);
+
+     The error parameter should be 0 on success, -ve on error. The
+     construction record is destroyed by this action and the authorisation
+     key will be revoked. If an error is indicated, the key under
+     construction will be negatively instantiated if it wasn't already
+     instantiated.
+
+     If this method returns an error, that error will be returned to the
+     caller of request_key*(). complete_request_key() must be called prior
+     to returning.
+
+     The key under construction and the authorisation key can be found in
+     the key_construction struct pointed to by cons:
+
+     (*) struct key *key;
+
+	 The key under construction.
+
+     (*) struct key *authkey;
+
+	 The authorisation key.
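+
+     For illustration, a skeletal request_key method for a hypothetical
+     "foo" key type; foo_fetch_and_instantiate() is an invented helper that
+     would obtain the key material and instantiate cons->key:
+
+	static int foo_request_key(struct key_construction *cons,
+				   const char *op, void *aux)
+	{
+		int err = foo_fetch_and_instantiate(cons->key, aux);
+
+		/* must always be called, on success and failure alike */
+		complete_request_key(cons, err);
+		return err;
+	}
+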
+
+
+============================
+REQUEST-KEY CALLBACK SERVICE
+============================
+
+To create a new key, the kernel will attempt to execute the following
+command line:
+
+	/sbin/request-key create <key> <uid> <gid> \
+		<threadring> <processring> <sessionring>
+
+<key> is the key being constructed, and the three keyrings are the process
+keyrings from the process that caused the search to be issued. These are
+included for two reasons:
+
+ (1) There may be an authentication token in one of the keyrings that is
+     required to obtain the key, eg: a Kerberos Ticket-Granting Ticket.
+
+ (2) The new key should probably be cached in one of these rings.
+
+This program should set its UID and GID to those specified before attempting
+to access any more keys. It may then look around for a user specific process
+to hand the request off to (perhaps a path held in another key by, for
+example, the KDE desktop manager).
+
+The program (or whatever it calls) should finish construction of the key by
+calling KEYCTL_INSTANTIATE or KEYCTL_INSTANTIATE_IOV, which also permits it
+to cache the key in one of the keyrings (probably the session ring) before
+returning. Alternatively, the key can be marked as negative with
+KEYCTL_NEGATE or KEYCTL_REJECT; this also permits the key to be cached in
+one of the keyrings.
+
+If it returns with the key remaining in the unconstructed state, the key
+will be marked as being negative, it will be added to the session keyring,
+and an error will be returned to the key requestor.
+
+Supplementary information may be provided from whoever or whatever invoked
+this service. This will be passed as the <callout_info> parameter. If no
+such information was made available, then "-" will be passed as this
+parameter instead.
+
+
+Similarly, the kernel may attempt to update an expired or a soon to expire
+key by executing:
+
+	/sbin/request-key update <key> <uid> <gid>
+
+In this case, the program isn't required to actually attach the key to a
+ring; the rings are provided for reference.
+
+
+==================
+GARBAGE COLLECTION
+==================
+
+Dead keys (for which the type has been removed) will be automatically
+unlinked from those keyrings that point to them and deleted as soon as
+possible by a background garbage collector.
+
+Similarly, revoked and expired keys will be garbage collected, but only
+after a certain amount of time has passed.
This time is set as a number of seconds in: + + /proc/sys/kernel/keys/gc_delay diff --git a/Documentation/security/tomoyo.txt b/Documentation/security/tomoyo.txt new file mode 100644 index 00000000000..200a2d37cbc --- /dev/null +++ b/Documentation/security/tomoyo.txt @@ -0,0 +1,55 @@ +--- What is TOMOYO? --- + +TOMOYO is a name-based MAC extension (LSM module) for the Linux kernel. + +LiveCD-based tutorials are available at +http://tomoyo.sourceforge.jp/1.7/1st-step/ubuntu10.04-live/ +http://tomoyo.sourceforge.jp/1.7/1st-step/centos5-live/ . +Though these tutorials use non-LSM version of TOMOYO, they are useful for you +to know what TOMOYO is. + +--- How to enable TOMOYO? --- + +Build the kernel with CONFIG_SECURITY_TOMOYO=y and pass "security=tomoyo" on +kernel's command line. + +Please see http://tomoyo.sourceforge.jp/2.3/ for details. + +--- Where is documentation? --- + +User <-> Kernel interface documentation is available at +http://tomoyo.sourceforge.jp/2.3/policy-reference.html . + +Materials we prepared for seminars and symposiums are available at +http://sourceforge.jp/projects/tomoyo/docs/?category_id=532&language_id=1 . +Below lists are chosen from three aspects. + +What is TOMOYO? + TOMOYO Linux Overview + http://sourceforge.jp/projects/tomoyo/docs/lca2009-takeda.pdf + TOMOYO Linux: pragmatic and manageable security for Linux + http://sourceforge.jp/projects/tomoyo/docs/freedomhectaipei-tomoyo.pdf + TOMOYO Linux: A Practical Method to Understand and Protect Your Own Linux Box + http://sourceforge.jp/projects/tomoyo/docs/PacSec2007-en-no-demo.pdf + +What can TOMOYO do? + Deep inside TOMOYO Linux + http://sourceforge.jp/projects/tomoyo/docs/lca2009-kumaneko.pdf + The role of "pathname based access control" in security. + http://sourceforge.jp/projects/tomoyo/docs/lfj2008-bof.pdf + +History of TOMOYO? + Realities of Mainlining + http://sourceforge.jp/projects/tomoyo/docs/lfj2008.pdf + +--- What is future plan? --- + +We believe that inode based security and name based security are complementary +and both should be used together. But unfortunately, so far, we cannot enable +multiple LSM modules at the same time. We feel sorry that you have to give up +SELinux/SMACK/AppArmor etc. when you want to use TOMOYO. + +We hope that LSM becomes stackable in future. Meanwhile, you can use non-LSM +version of TOMOYO, available at http://tomoyo.sourceforge.jp/1.7/ . +LSM version of TOMOYO is a subset of non-LSM version of TOMOYO. We are planning +to port non-LSM version's functionalities to LSM versions. diff --git a/Documentation/tomoyo.txt b/Documentation/tomoyo.txt deleted file mode 100644 index 200a2d37cbc..00000000000 --- a/Documentation/tomoyo.txt +++ /dev/null @@ -1,55 +0,0 @@ ---- What is TOMOYO? --- - -TOMOYO is a name-based MAC extension (LSM module) for the Linux kernel. - -LiveCD-based tutorials are available at -http://tomoyo.sourceforge.jp/1.7/1st-step/ubuntu10.04-live/ -http://tomoyo.sourceforge.jp/1.7/1st-step/centos5-live/ . -Though these tutorials use non-LSM version of TOMOYO, they are useful for you -to know what TOMOYO is. - ---- How to enable TOMOYO? --- - -Build the kernel with CONFIG_SECURITY_TOMOYO=y and pass "security=tomoyo" on -kernel's command line. - -Please see http://tomoyo.sourceforge.jp/2.3/ for details. - ---- Where is documentation? --- - -User <-> Kernel interface documentation is available at -http://tomoyo.sourceforge.jp/2.3/policy-reference.html . 
- -Materials we prepared for seminars and symposiums are available at -http://sourceforge.jp/projects/tomoyo/docs/?category_id=532&language_id=1 . -Below lists are chosen from three aspects. - -What is TOMOYO? - TOMOYO Linux Overview - http://sourceforge.jp/projects/tomoyo/docs/lca2009-takeda.pdf - TOMOYO Linux: pragmatic and manageable security for Linux - http://sourceforge.jp/projects/tomoyo/docs/freedomhectaipei-tomoyo.pdf - TOMOYO Linux: A Practical Method to Understand and Protect Your Own Linux Box - http://sourceforge.jp/projects/tomoyo/docs/PacSec2007-en-no-demo.pdf - -What can TOMOYO do? - Deep inside TOMOYO Linux - http://sourceforge.jp/projects/tomoyo/docs/lca2009-kumaneko.pdf - The role of "pathname based access control" in security. - http://sourceforge.jp/projects/tomoyo/docs/lfj2008-bof.pdf - -History of TOMOYO? - Realities of Mainlining - http://sourceforge.jp/projects/tomoyo/docs/lfj2008.pdf - ---- What is future plan? --- - -We believe that inode based security and name based security are complementary -and both should be used together. But unfortunately, so far, we cannot enable -multiple LSM modules at the same time. We feel sorry that you have to give up -SELinux/SMACK/AppArmor etc. when you want to use TOMOYO. - -We hope that LSM becomes stackable in future. Meanwhile, you can use non-LSM -version of TOMOYO, available at http://tomoyo.sourceforge.jp/1.7/ . -LSM version of TOMOYO is a subset of non-LSM version of TOMOYO. We are planning -to port non-LSM version's functionalities to LSM versions. diff --git a/MAINTAINERS b/MAINTAINERS index 69f19f10314..3fa170ba5f9 100644 --- a/MAINTAINERS +++ b/MAINTAINERS @@ -3705,7 +3705,7 @@ KEYS/KEYRINGS: M: David Howells L: keyrings@linux-nfs.org S: Maintained -F: Documentation/keys.txt +F: Documentation/security/keys.txt F: include/linux/key.h F: include/linux/key-type.h F: include/keys/ @@ -3717,7 +3717,7 @@ M: Mimi Zohar L: linux-security-module@vger.kernel.org L: keyrings@linux-nfs.org S: Supported -F: Documentation/keys-trusted-encrypted.txt +F: Documentation/security/keys-trusted-encrypted.txt F: include/keys/trusted-type.h F: security/keys/trusted.c F: security/keys/trusted.h @@ -3728,7 +3728,7 @@ M: David Safford L: linux-security-module@vger.kernel.org L: keyrings@linux-nfs.org S: Supported -F: Documentation/keys-trusted-encrypted.txt +F: Documentation/security/keys-trusted-encrypted.txt F: include/keys/encrypted-type.h F: security/keys/encrypted.c F: security/keys/encrypted.h diff --git a/include/linux/cred.h b/include/linux/cred.h index be16b61283c..82607992f30 100644 --- a/include/linux/cred.h +++ b/include/linux/cred.h @@ -1,4 +1,4 @@ -/* Credentials management - see Documentation/credentials.txt +/* Credentials management - see Documentation/security/credentials.txt * * Copyright (C) 2008 Red Hat, Inc. All Rights Reserved. * Written by David Howells (dhowells@redhat.com) diff --git a/include/linux/key.h b/include/linux/key.h index b2bb0171956..303982a6993 100644 --- a/include/linux/key.h +++ b/include/linux/key.h @@ -9,7 +9,7 @@ * 2 of the License, or (at your option) any later version. * * - * See Documentation/keys.txt for information on keys/keyrings. + * See Documentation/security/keys.txt for information on keys/keyrings. 
*/ #ifndef _LINUX_KEY_H diff --git a/kernel/cred.c b/kernel/cred.c index 8093c16b84b..004e3679624 100644 --- a/kernel/cred.c +++ b/kernel/cred.c @@ -1,4 +1,4 @@ -/* Task credentials management - see Documentation/credentials.txt +/* Task credentials management - see Documentation/security/credentials.txt * * Copyright (C) 2008 Red Hat, Inc. All Rights Reserved. * Written by David Howells (dhowells@redhat.com) diff --git a/scripts/selinux/README b/scripts/selinux/README index a936315ba2c..4d020ecb752 100644 --- a/scripts/selinux/README +++ b/scripts/selinux/README @@ -1,2 +1,2 @@ -Please see Documentation/SELinux.txt for information on +Please see Documentation/security/SELinux.txt for information on installing a dummy SELinux policy. diff --git a/security/apparmor/match.c b/security/apparmor/match.c index 06d764ccbbe..94de6b4907c 100644 --- a/security/apparmor/match.c +++ b/security/apparmor/match.c @@ -194,7 +194,7 @@ void aa_dfa_free_kref(struct kref *kref) * @flags: flags controlling what type of accept tables are acceptable * * Unpack a dfa that has been serialized. To find information on the dfa - * format look in Documentation/apparmor.txt + * format look in Documentation/security/apparmor.txt * Assumes the dfa @blob stream has been aligned on a 8 byte boundary * * Returns: an unpacked dfa ready for matching or ERR_PTR on failure diff --git a/security/apparmor/policy_unpack.c b/security/apparmor/policy_unpack.c index e33aaf7e574..d6d9a57b565 100644 --- a/security/apparmor/policy_unpack.c +++ b/security/apparmor/policy_unpack.c @@ -12,8 +12,8 @@ * published by the Free Software Foundation, version 2 of the * License. * - * AppArmor uses a serialized binary format for loading policy. - * To find policy format documentation look in Documentation/apparmor.txt + * AppArmor uses a serialized binary format for loading policy. To find + * policy format documentation look in Documentation/security/apparmor.txt * All policy is validated before it is used. */ diff --git a/security/keys/encrypted.c b/security/keys/encrypted.c index 69907a58a68..b1cba5bf0a5 100644 --- a/security/keys/encrypted.c +++ b/security/keys/encrypted.c @@ -8,7 +8,7 @@ * it under the terms of the GNU General Public License as published by * the Free Software Foundation, version 2 of the License. * - * See Documentation/keys-trusted-encrypted.txt + * See Documentation/security/keys-trusted-encrypted.txt */ #include diff --git a/security/keys/request_key.c b/security/keys/request_key.c index df3c0417ee4..d41cc153a31 100644 --- a/security/keys/request_key.c +++ b/security/keys/request_key.c @@ -8,7 +8,7 @@ * as published by the Free Software Foundation; either version * 2 of the License, or (at your option) any later version. * - * See Documentation/keys-request-key.txt + * See Documentation/security/keys-request-key.txt */ #include diff --git a/security/keys/request_key_auth.c b/security/keys/request_key_auth.c index 68164031a74..3c0cfdec6e3 100644 --- a/security/keys/request_key_auth.c +++ b/security/keys/request_key_auth.c @@ -8,7 +8,7 @@ * as published by the Free Software Foundation; either version * 2 of the License, or (at your option) any later version. 
* - * See Documentation/keys-request-key.txt + * See Documentation/security/keys-request-key.txt */ #include diff --git a/security/keys/trusted.c b/security/keys/trusted.c index c99b9368368..0c33e2ea1f3 100644 --- a/security/keys/trusted.c +++ b/security/keys/trusted.c @@ -8,7 +8,7 @@ * it under the terms of the GNU General Public License as published by * the Free Software Foundation, version 2 of the License. * - * See Documentation/keys-trusted-encrypted.txt + * See Documentation/security/keys-trusted-encrypted.txt */ #include -- cgit v1.2.3-70-g09d2 From 571d76acdab95876aeff869ab6449f826c23aa43 Mon Sep 17 00:00:00 2001 From: Chris Metcalf Date: Mon, 16 May 2011 14:23:44 -0400 Subject: arch/tile: support signal "exception-trace" hook This change adds support for /proc/sys/debug/exception-trace to tile. Like x86 and sparc, by default it is set to "1", generating a one-line printk whenever a user process crashes. By setting it to "2", we get a much more complete userspace diagnostic at crash time, including a user-space backtrace, register dump, and memory dump around the address of the crash. Some vestiges of the Tilera-internal version of this support are removed with this patch (the show_crashinfo variable and the arch_coredump_signal function). We retain a "crashinfo" boot parameter which allows you to set the boot-time value of exception-trace. Signed-off-by: Chris Metcalf --- arch/tile/include/asm/processor.h | 7 --- arch/tile/include/asm/signal.h | 4 ++ arch/tile/kernel/compat_signal.c | 4 +- arch/tile/kernel/signal.c | 128 ++++++++++++++++++++++++++++++++++++-- arch/tile/kernel/single_step.c | 4 ++ arch/tile/kernel/traps.c | 1 + arch/tile/mm/fault.c | 24 ++++--- kernel/sysctl.c | 2 +- 8 files changed, 151 insertions(+), 23 deletions(-) (limited to 'kernel') diff --git a/arch/tile/include/asm/processor.h b/arch/tile/include/asm/processor.h index d6b43ddfcc0..34c1e01ffb5 100644 --- a/arch/tile/include/asm/processor.h +++ b/arch/tile/include/asm/processor.h @@ -257,10 +257,6 @@ static inline void cpu_relax(void) barrier(); } -struct siginfo; -extern void arch_coredump_signal(struct siginfo *, struct pt_regs *); -#define arch_coredump_signal arch_coredump_signal - /* Info on this processor (see fs/proc/cpuinfo.c) */ struct seq_operations; extern const struct seq_operations cpuinfo_op; @@ -271,9 +267,6 @@ extern char chip_model[64]; /* Data on which physical memory controller corresponds to which NUMA node. */ extern int node_controller[]; -/* Do we dump information to the console when a user application crashes? */ -extern int show_crashinfo; - #if CHIP_HAS_CBOX_HOME_MAP() /* Does the heap allocator return hash-for-home pages by default? 
*/ extern int hash_default; diff --git a/arch/tile/include/asm/signal.h b/arch/tile/include/asm/signal.h index 81d92a45cd4..1e1e616783e 100644 --- a/arch/tile/include/asm/signal.h +++ b/arch/tile/include/asm/signal.h @@ -28,6 +28,10 @@ struct pt_regs; int restore_sigcontext(struct pt_regs *, struct sigcontext __user *); int setup_sigcontext(struct sigcontext __user *, struct pt_regs *); void do_signal(struct pt_regs *regs); +void signal_fault(const char *type, struct pt_regs *, + void __user *frame, int sig); +void trace_unhandled_signal(const char *type, struct pt_regs *regs, + unsigned long address, int signo); #endif #endif /* _ASM_TILE_SIGNAL_H */ diff --git a/arch/tile/kernel/compat_signal.c b/arch/tile/kernel/compat_signal.c index dbb0dfc7bec..a7869ad6277 100644 --- a/arch/tile/kernel/compat_signal.c +++ b/arch/tile/kernel/compat_signal.c @@ -317,7 +317,7 @@ long compat_sys_rt_sigreturn(struct pt_regs *regs) return 0; badframe: - force_sig(SIGSEGV, current); + signal_fault("bad sigreturn frame", regs, frame, 0); return 0; } @@ -431,6 +431,6 @@ int compat_setup_rt_frame(int sig, struct k_sigaction *ka, siginfo_t *info, return 0; give_sigsegv: - force_sigsegv(sig, current); + signal_fault("bad setup frame", regs, frame, sig); return -EFAULT; } diff --git a/arch/tile/kernel/signal.c b/arch/tile/kernel/signal.c index 1260321155f..bedaf4e9f3a 100644 --- a/arch/tile/kernel/signal.c +++ b/arch/tile/kernel/signal.c @@ -39,7 +39,6 @@ #define _BLOCKABLE (~(sigmask(SIGKILL) | sigmask(SIGSTOP))) - SYSCALL_DEFINE3(sigaltstack, const stack_t __user *, uss, stack_t __user *, uoss, struct pt_regs *, regs) { @@ -78,6 +77,13 @@ int restore_sigcontext(struct pt_regs *regs, return err; } +void signal_fault(const char *type, struct pt_regs *regs, + void __user *frame, int sig) +{ + trace_unhandled_signal(type, regs, (unsigned long)frame, SIGSEGV); + force_sigsegv(sig, current); +} + /* The assembly shim for this function arranges to ignore the return value. */ SYSCALL_DEFINE1(rt_sigreturn, struct pt_regs *, regs) { @@ -105,7 +111,7 @@ SYSCALL_DEFINE1(rt_sigreturn, struct pt_regs *, regs) return 0; badframe: - force_sig(SIGSEGV, current); + signal_fault("bad sigreturn frame", regs, frame, 0); return 0; } @@ -231,7 +237,7 @@ static int setup_rt_frame(int sig, struct k_sigaction *ka, siginfo_t *info, return 0; give_sigsegv: - force_sigsegv(sig, current); + signal_fault("bad setup frame", regs, frame, sig); return -EFAULT; } @@ -245,7 +251,6 @@ static int handle_signal(unsigned long sig, siginfo_t *info, { int ret; - /* Are we from a system call? */ if (regs->faultnum == INT_SWINT_1) { /* If so, check system call restarting.. */ @@ -363,3 +368,118 @@ done: /* Avoid double syscall restart if there are nested signals. 
*/ regs->faultnum = INT_SWINT_1_SIGRETURN; } + +int show_unhandled_signals = 1; + +static int __init crashinfo(char *str) +{ + unsigned long val; + const char *word; + + if (*str == '\0') + val = 2; + else if (*str != '=' || strict_strtoul(++str, 0, &val) != 0) + return 0; + show_unhandled_signals = val; + switch (show_unhandled_signals) { + case 0: + word = "No"; + break; + case 1: + word = "One-line"; + break; + default: + word = "Detailed"; + break; + } + pr_info("%s crash reports will be generated on the console\n", word); + return 1; +} +__setup("crashinfo", crashinfo); + +static void dump_mem(void __user *address) +{ + void __user *addr; + enum { region_size = 256, bytes_per_line = 16 }; + int i, j, k; + int found_readable_mem = 0; + + pr_err("\n"); + if (!access_ok(VERIFY_READ, address, 1)) { + pr_err("Not dumping at address 0x%lx (kernel address)\n", + (unsigned long)address); + return; + } + + addr = (void __user *) + (((unsigned long)address & -bytes_per_line) - region_size/2); + if (addr > address) + addr = NULL; + for (i = 0; i < region_size; + addr += bytes_per_line, i += bytes_per_line) { + unsigned char buf[bytes_per_line]; + char line[100]; + if (copy_from_user(buf, addr, bytes_per_line)) + continue; + if (!found_readable_mem) { + pr_err("Dumping memory around address 0x%lx:\n", + (unsigned long)address); + found_readable_mem = 1; + } + j = sprintf(line, REGFMT":", (unsigned long)addr); + for (k = 0; k < bytes_per_line; ++k) + j += sprintf(&line[j], " %02x", buf[k]); + pr_err("%s\n", line); + } + if (!found_readable_mem) + pr_err("No readable memory around address 0x%lx\n", + (unsigned long)address); +} + +void trace_unhandled_signal(const char *type, struct pt_regs *regs, + unsigned long address, int sig) +{ + struct task_struct *tsk = current; + + if (show_unhandled_signals == 0) + return; + + /* If the signal is handled, don't show it here. */ + if (!is_global_init(tsk)) { + void __user *handler = + tsk->sighand->action[sig-1].sa.sa_handler; + if (handler != SIG_IGN && handler != SIG_DFL) + return; + } + + /* Rate-limit the one-line output, not the detailed output. */ + if (show_unhandled_signals <= 1 && !printk_ratelimit()) + return; + + printk("%s%s[%d]: %s at %lx pc "REGFMT" signal %d", + task_pid_nr(tsk) > 1 ? 
KERN_INFO : KERN_EMERG, + tsk->comm, task_pid_nr(tsk), type, address, regs->pc, sig); + + print_vma_addr(KERN_CONT " in ", regs->pc); + + printk(KERN_CONT "\n"); + + if (show_unhandled_signals > 1) { + switch (sig) { + case SIGILL: + case SIGFPE: + case SIGSEGV: + case SIGBUS: + pr_err("User crash: signal %d," + " trap %ld, address 0x%lx\n", + sig, regs->faultnum, address); + show_regs(regs); + dump_mem((void __user *)address); + break; + default: + pr_err("User crash: signal %d, trap %ld\n", + sig, regs->faultnum); + break; + } + } +} diff --git a/arch/tile/kernel/single_step.c b/arch/tile/kernel/single_step.c index 86df5a239b7..4032ca8e51b 100644 --- a/arch/tile/kernel/single_step.c +++ b/arch/tile/kernel/single_step.c @@ -186,6 +186,8 @@ static tile_bundle_bits rewrite_load_store_unaligned( .si_code = SEGV_MAPERR, .si_addr = addr }; + trace_unhandled_signal("segfault", regs, + (unsigned long)addr, SIGSEGV); force_sig_info(info.si_signo, &info, current); return (tile_bundle_bits) 0; } @@ -196,6 +198,8 @@ static tile_bundle_bits rewrite_load_store_unaligned( .si_code = BUS_ADRALN, .si_addr = addr }; + trace_unhandled_signal("unaligned trap", regs, + (unsigned long)addr, SIGBUS); force_sig_info(info.si_signo, &info, current); return (tile_bundle_bits) 0; } diff --git a/arch/tile/kernel/traps.c b/arch/tile/kernel/traps.c index 5474fc2e77e..f9803dfa735 100644 --- a/arch/tile/kernel/traps.c +++ b/arch/tile/kernel/traps.c @@ -308,6 +308,7 @@ void __kprobes do_trap(struct pt_regs *regs, int fault_num, info.si_addr = (void __user *)address; if (signo == SIGILL) info.si_trapno = fault_num; + trace_unhandled_signal("trap", regs, address, signo); force_sig_info(signo, &info, current); } diff --git a/arch/tile/mm/fault.c b/arch/tile/mm/fault.c index 24ca54a0703..25b7b90fd62 100644 --- a/arch/tile/mm/fault.c +++ b/arch/tile/mm/fault.c @@ -43,8 +43,11 @@ #include -static noinline void force_sig_info_fault(int si_signo, int si_code, - unsigned long address, int fault_num, struct task_struct *tsk) +static noinline void force_sig_info_fault(const char *type, int si_signo, + int si_code, unsigned long address, + int fault_num, + struct task_struct *tsk, + struct pt_regs *regs) { siginfo_t info; @@ -59,6 +62,7 @@ static noinline void force_sig_info_fault(int si_signo, int si_code, info.si_code = si_code; info.si_addr = (void __user *)address; info.si_trapno = fault_num; + trace_unhandled_signal(type, regs, address, si_signo); force_sig_info(si_signo, &info, tsk); } @@ -71,11 +75,12 @@ SYSCALL_DEFINE2(cmpxchg_badaddr, unsigned long, address, struct pt_regs *, regs) { if (address >= PAGE_OFFSET) - force_sig_info_fault(SIGSEGV, SEGV_MAPERR, address, - INT_DTLB_MISS, current); + force_sig_info_fault("atomic segfault", SIGSEGV, SEGV_MAPERR, + address, INT_DTLB_MISS, current, regs); else - force_sig_info_fault(SIGBUS, BUS_ADRALN, address, - INT_UNALIGN_DATA, current); + force_sig_info_fault("atomic alignment fault", SIGBUS, + BUS_ADRALN, address, + INT_UNALIGN_DATA, current, regs); /* * Adjust pc to point at the actual instruction, which is unusual @@ -471,8 +476,8 @@ bad_area_nosemaphore: */ local_irq_enable(); - force_sig_info_fault(SIGSEGV, si_code, address, - fault_num, tsk); + force_sig_info_fault("segfault", SIGSEGV, si_code, address, + fault_num, tsk, regs); return 0; } @@ -547,7 +552,8 @@ do_sigbus: if (is_kernel_mode) goto no_context; - force_sig_info_fault(SIGBUS, BUS_ADRERR, address, fault_num, tsk); + force_sig_info_fault("bus error", SIGBUS, BUS_ADRERR, address, + fault_num, tsk, regs); return 0; } 
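For reference, the new sysctl can be exercised from userspace. A hypothetical test program (illustrative only; the path /proc/sys/debug/exception-trace and the values 0/1/2 come from the changelog above) might look like:

#include <fcntl.h>
#include <unistd.h>

int main(void)
{
	/* Ask for the detailed crash report mode ("2"); needs root. */
	int fd = open("/proc/sys/debug/exception-trace", O_WRONLY);

	if (fd >= 0) {
		write(fd, "2", 1);
		close(fd);
	}

	/* Fault on purpose; the console should then carry a register
	 * dump, a user backtrace and a memory dump around the fault. */
	*(volatile int *)0 = 0;
	return 0;
}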
diff --git a/kernel/sysctl.c b/kernel/sysctl.c index c0bb32414b1..aaec9342a33 100644 --- a/kernel/sysctl.c +++ b/kernel/sysctl.c @@ -1496,7 +1496,7 @@ static struct ctl_table fs_table[] = { static struct ctl_table debug_table[] = { #if defined(CONFIG_X86) || defined(CONFIG_PPC) || defined(CONFIG_SPARC) || \ - defined(CONFIG_S390) + defined(CONFIG_S390) || defined(CONFIG_TILE) { .procname = "exception-trace", .data = &show_unhandled_signals, -- cgit v1.2.3-70-g09d2 From 4e2d9491a78929badcf774869b458486acb96365 Mon Sep 17 00:00:00 2001 From: "Rafael J. Wysocki" Date: Tue, 24 May 2011 00:21:26 +0200 Subject: PM / Hibernate: Update some comments in core hibernate code Some comments in the core hibernate code are outdated, some aren't necessary any more and at least one of them is plain wrong. Remove those comments or update them. Signed-off-by: Rafael J. Wysocki --- kernel/power/hibernate.c | 21 +++++++-------------- 1 file changed, 7 insertions(+), 14 deletions(-) (limited to 'kernel') diff --git a/kernel/power/hibernate.c b/kernel/power/hibernate.c index f9bec56d882..6418d8c8cdb 100644 --- a/kernel/power/hibernate.c +++ b/kernel/power/hibernate.c @@ -248,12 +248,6 @@ static int create_image(int platform_mode) if (error) return error; - /* At this point, dpm_suspend_start() has been called, but *not* - * dpm_suspend_noirq(). We *must* call dpm_suspend_noirq() now. - * Otherwise, drivers for some devices (e.g. interrupt controllers) - * become desynchronized with the actual state of the hardware - * at resume time, and evil weirdness ensues. - */ error = dpm_suspend_noirq(PMSG_FREEZE); if (error) { printk(KERN_ERR "PM: Some devices failed to power down, " @@ -297,9 +291,6 @@ static int create_image(int platform_mode) Power_up: syscore_resume(); - /* NOTE: dpm_resume_noirq() is just a resume() for devices - * that suspended with irqs off ... no overall powerup. - */ Enable_irqs: local_irq_enable(); @@ -416,24 +407,26 @@ static int resume_target_kernel(bool platform_mode) if (error) goto Enable_irqs; - /* We'll ignore saved state, but this gets preempt count (etc) right */ save_processor_state(); error = restore_highmem(); if (!error) { error = swsusp_arch_resume(); /* * The code below is only ever reached in case of a failure. - * Otherwise execution continues at place where - * swsusp_arch_suspend() was called + * Otherwise, execution continues at the place where + * swsusp_arch_suspend() was called. */ BUG_ON(!error); - /* This call to restore_highmem() undos the previous one */ + /* + * This call to restore_highmem() reverts the changes made by + * the previous one. + */ restore_highmem(); } /* * The only reason why swsusp_arch_resume() can fail is memory being * very tight, so we have to free it as soon as we can to avoid - * subsequent failures + * subsequent failures. */ swsusp_free(); restore_processor_state(); -- cgit v1.2.3-70-g09d2 From 6e9101aeec39961308176e0f59e73ac5d37d243a Mon Sep 17 00:00:00 2001 From: Ingo Molnar Date: Tue, 24 May 2011 05:43:18 +0200 Subject: watchdog: Fix non-standard prototype of get_softlockup_thresh() This build warning slipped through: kernel/watchdog.c:102: warning: function declaration isn't a prototype As reported by Stephen Rothwell. Also address an unused variable warning that GCC 4.6.0 reports: we cannot do anything about failed watchdog ops during CPU hotplug (it's not serious enough to return an error from the notifier), so ignore them. 
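The warning text refers to a C subtlety: an empty parameter list in a declaration is old-style C and leaves the argument types unspecified, so it is not a prototype; only (void) declares that the function takes no arguments. A minimal illustration (hypothetical name, not from the patch):

static int get_thresh();	/* old-style declaration, not a prototype */
static int get_thresh(void);	/* a real prototype: provably no arguments */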
Reported-by: Stephen Rothwell Cc: Mandeep Singh Baines Cc: Marcin Slusarz Cc: Don Zickus Cc: Peter Zijlstra Cc: Frederic Weisbecker Link: http://lkml.kernel.org/r/20110524134129.8da27016.sfr@canb.auug.org.au Signed-off-by: Ingo Molnar LKML-Reference: <20110517071642.GF22305@elte.hu> --- kernel/watchdog.c | 11 ++++------- 1 file changed, 4 insertions(+), 7 deletions(-) (limited to 'kernel')
diff --git a/kernel/watchdog.c b/kernel/watchdog.c index 6e63097fa73..3d0c56ad479 100644 --- a/kernel/watchdog.c +++ b/kernel/watchdog.c @@ -98,7 +98,7 @@ __setup("nosoftlockup", nosoftlockup_setup); * the thresholds with a factor: we make the soft threshold twice the amount of * time the hard threshold is. */ -static int get_softlockup_thresh() +static int get_softlockup_thresh(void) { return watchdog_thresh * 2; } @@ -415,15 +415,13 @@ static void watchdog_nmi_disable(int cpu) { return; } #endif /* CONFIG_HARDLOCKUP_DETECTOR */ /* prepare/enable/disable routines */ -static int watchdog_prepare_cpu(int cpu) +static void watchdog_prepare_cpu(int cpu) { struct hrtimer *hrtimer = &per_cpu(watchdog_hrtimer, cpu); WARN_ON(per_cpu(softlockup_watchdog, cpu)); hrtimer_init(hrtimer, CLOCK_MONOTONIC, HRTIMER_MODE_REL); hrtimer->function = watchdog_timer_fn; - - return 0; } static int watchdog_enable(int cpu) @@ -542,17 +540,16 @@ static int __cpuinit cpu_callback(struct notifier_block *nfb, unsigned long action, void *hcpu) { int hotcpu = (unsigned long)hcpu; - int err = 0; switch (action) { case CPU_UP_PREPARE: case CPU_UP_PREPARE_FROZEN: - err = watchdog_prepare_cpu(hotcpu); + watchdog_prepare_cpu(hotcpu); break; case CPU_ONLINE: case CPU_ONLINE_FROZEN: if (watchdog_enabled) - err = watchdog_enable(hotcpu); + watchdog_enable(hotcpu); break; #ifdef CONFIG_HOTPLUG_CPU case CPU_UP_CANCELED: -- cgit v1.2.3-70-g09d2
From 8af088710d1eb3c980e0ef3779c8d47f3f217b48 Mon Sep 17 00:00:00 2001 From: Eric Dumazet Date: Tue, 24 May 2011 11:12:58 +0200 Subject: posix-timers: RCU conversion Ben Nagy reported a scalability problem with KVM/QEMU that, on his 48-core machine, hit a single spinlock (idr_lock) in the posix-timers code very hard. Even on a 16-CPU machine (2x4x2), a single test can show 98% of CPU time spent in ticket_spin_lock, called from lock_timer. Ref: http://www.spinics.net/lists/kvm/msg51526.html Switching to RCU is quite easy, since the IDR is already RCU-ready. idr_lock should be locked only for an insert/delete, not a lookup. Benchmark on a 2x4x2 machine, 16 processes calling timer_gettime(). Before : real 1m18.669s user 0m1.346s sys 1m17.180s After : real 0m3.296s user 0m1.366s sys 0m1.926s Reported-by: Ben Nagy Signed-off-by: Eric Dumazet Tested-by: Ben Nagy Cc: Oleg Nesterov Cc: Avi Kivity Cc: John Stultz Cc: Richard Cochran Cc: Paul E.
McKenney Cc: Andrew Morton Signed-off-by: Thomas Gleixner --- include/linux/posix-timers.h | 1 + kernel/posix-timers.c | 25 ++++++++++++++----------- 2 files changed, 15 insertions(+), 11 deletions(-) (limited to 'kernel') diff --git a/include/linux/posix-timers.h b/include/linux/posix-timers.h index 808227d40a6..959c14132f4 100644 --- a/include/linux/posix-timers.h +++ b/include/linux/posix-timers.h @@ -82,6 +82,7 @@ struct k_itimer { unsigned long expires; } mmtimer; struct alarm alarmtimer; + struct rcu_head rcu; } it; }; diff --git a/kernel/posix-timers.c b/kernel/posix-timers.c index a1b5edf1bf9..4556182527f 100644 --- a/kernel/posix-timers.c +++ b/kernel/posix-timers.c @@ -491,6 +491,13 @@ static struct k_itimer * alloc_posix_timer(void) return tmr; } +static void k_itimer_rcu_free(struct rcu_head *head) +{ + struct k_itimer *tmr = container_of(head, struct k_itimer, it.rcu); + + kmem_cache_free(posix_timers_cache, tmr); +} + #define IT_ID_SET 1 #define IT_ID_NOT_SET 0 static void release_posix_timer(struct k_itimer *tmr, int it_id_set) @@ -503,7 +510,7 @@ static void release_posix_timer(struct k_itimer *tmr, int it_id_set) } put_pid(tmr->it_pid); sigqueue_free(tmr->sigq); - kmem_cache_free(posix_timers_cache, tmr); + call_rcu(&tmr->it.rcu, k_itimer_rcu_free); } static struct k_clock *clockid_to_kclock(const clockid_t id) @@ -631,22 +638,18 @@ out: static struct k_itimer *__lock_timer(timer_t timer_id, unsigned long *flags) { struct k_itimer *timr; - /* - * Watch out here. We do a irqsave on the idr_lock and pass the - * flags part over to the timer lock. Must not let interrupts in - * while we are moving the lock. - */ - spin_lock_irqsave(&idr_lock, *flags); + + rcu_read_lock(); timr = idr_find(&posix_timers_id, (int)timer_id); if (timr) { - spin_lock(&timr->it_lock); + spin_lock_irqsave(&timr->it_lock, *flags); if (timr->it_signal == current->signal) { - spin_unlock(&idr_lock); + rcu_read_unlock(); return timr; } - spin_unlock(&timr->it_lock); + spin_unlock_irqrestore(&timr->it_lock, *flags); } - spin_unlock_irqrestore(&idr_lock, *flags); + rcu_read_unlock(); return NULL; } -- cgit v1.2.3-70-g09d2 From 354258011e8e86961f7a72ad154ca8caf0c4c6f7 Mon Sep 17 00:00:00 2001 From: "Rafael J. Wysocki" Date: Tue, 24 May 2011 23:35:55 +0200 Subject: PM / Hibernate: Remove arch_prepare_suspend() All architectures supporting hibernation define arch_prepare_suspend() as an empty function, so remove it. Signed-off-by: Rafael J. Wysocki --- arch/frv/include/asm/suspend.h | 20 -------------------- arch/mips/include/asm/suspend.h | 2 -- arch/powerpc/include/asm/suspend.h | 6 ------ arch/powerpc/kernel/swsusp.c | 1 - arch/s390/include/asm/suspend.h | 10 ---------- arch/sh/include/asm/suspend.h | 1 - arch/unicore32/include/asm/suspend.h | 1 - arch/x86/include/asm/suspend_32.h | 2 -- arch/x86/include/asm/suspend_64.h | 5 ----- kernel/power/hibernate.c | 5 ----- 10 files changed, 53 deletions(-) delete mode 100644 arch/frv/include/asm/suspend.h delete mode 100644 arch/powerpc/include/asm/suspend.h delete mode 100644 arch/s390/include/asm/suspend.h (limited to 'kernel') diff --git a/arch/frv/include/asm/suspend.h b/arch/frv/include/asm/suspend.h deleted file mode 100644 index 5fa7b5a6ee4..00000000000 --- a/arch/frv/include/asm/suspend.h +++ /dev/null @@ -1,20 +0,0 @@ -/* suspend.h: suspension stuff - * - * Copyright (C) 2004 Red Hat, Inc. All Rights Reserved. 
- * Written by David Howells (dhowells@redhat.com) - * - * This program is free software; you can redistribute it and/or - * modify it under the terms of the GNU General Public License - * as published by the Free Software Foundation; either version - * 2 of the License, or (at your option) any later version. - */ - -#ifndef _ASM_SUSPEND_H -#define _ASM_SUSPEND_H - -static inline int arch_prepare_suspend(void) -{ - return 0; -} - -#endif /* _ASM_SUSPEND_H */ diff --git a/arch/mips/include/asm/suspend.h b/arch/mips/include/asm/suspend.h index 294cdb66c5f..3adac3b53d1 100644 --- a/arch/mips/include/asm/suspend.h +++ b/arch/mips/include/asm/suspend.h @@ -1,8 +1,6 @@ #ifndef __ASM_SUSPEND_H #define __ASM_SUSPEND_H -static inline int arch_prepare_suspend(void) { return 0; } - /* References to section boundaries */ extern const void __nosave_begin, __nosave_end; diff --git a/arch/powerpc/include/asm/suspend.h b/arch/powerpc/include/asm/suspend.h deleted file mode 100644 index c6efc3466aa..00000000000 --- a/arch/powerpc/include/asm/suspend.h +++ /dev/null @@ -1,6 +0,0 @@ -#ifndef __ASM_POWERPC_SUSPEND_H -#define __ASM_POWERPC_SUSPEND_H - -static inline int arch_prepare_suspend(void) { return 0; } - -#endif /* __ASM_POWERPC_SUSPEND_H */ diff --git a/arch/powerpc/kernel/swsusp.c b/arch/powerpc/kernel/swsusp.c index 560c9611950..aa17b76dd42 100644 --- a/arch/powerpc/kernel/swsusp.c +++ b/arch/powerpc/kernel/swsusp.c @@ -10,7 +10,6 @@ */ #include -#include #include #include #include diff --git a/arch/s390/include/asm/suspend.h b/arch/s390/include/asm/suspend.h deleted file mode 100644 index dc75c616eaf..00000000000 --- a/arch/s390/include/asm/suspend.h +++ /dev/null @@ -1,10 +0,0 @@ -#ifndef __ASM_S390_SUSPEND_H -#define __ASM_S390_SUSPEND_H - -static inline int arch_prepare_suspend(void) -{ - return 0; -} - -#endif - diff --git a/arch/sh/include/asm/suspend.h b/arch/sh/include/asm/suspend.h index 64eb41a063e..e14567a7e9a 100644 --- a/arch/sh/include/asm/suspend.h +++ b/arch/sh/include/asm/suspend.h @@ -3,7 +3,6 @@ #ifndef __ASSEMBLY__ #include -static inline int arch_prepare_suspend(void) { return 0; } #include diff --git a/arch/unicore32/include/asm/suspend.h b/arch/unicore32/include/asm/suspend.h index 88a9c0f32b2..65bad75c7e9 100644 --- a/arch/unicore32/include/asm/suspend.h +++ b/arch/unicore32/include/asm/suspend.h @@ -14,7 +14,6 @@ #define __UNICORE_SUSPEND_H__ #ifndef __ASSEMBLY__ -static inline int arch_prepare_suspend(void) { return 0; } #include diff --git a/arch/x86/include/asm/suspend_32.h b/arch/x86/include/asm/suspend_32.h index fd921c3a684..487055c8c1a 100644 --- a/arch/x86/include/asm/suspend_32.h +++ b/arch/x86/include/asm/suspend_32.h @@ -9,8 +9,6 @@ #include #include -static inline int arch_prepare_suspend(void) { return 0; } - /* image of the saved processor state */ struct saved_context { u16 es, fs, gs, ss; diff --git a/arch/x86/include/asm/suspend_64.h b/arch/x86/include/asm/suspend_64.h index 8d942afae68..09b0bf10415 100644 --- a/arch/x86/include/asm/suspend_64.h +++ b/arch/x86/include/asm/suspend_64.h @@ -9,11 +9,6 @@ #include #include -static inline int arch_prepare_suspend(void) -{ - return 0; -} - /* * Image of the saved processor state, used by the low level ACPI suspend to * RAM code and by the low level hibernation code. 
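All of the stubs removed above are identical no-ops. A single weak default in common code would have been an alternative way to drop the per-architecture boilerplate; a sketch of that alternative (an assumption for illustration, not what this patch does):

/* Hypothetical common-code fallback: an architecture that really had
 * preparation work to do could override this non-weak. */
int __weak arch_prepare_suspend(void)
{
	return 0;
}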
diff --git a/kernel/power/hibernate.c b/kernel/power/hibernate.c index 6418d8c8cdb..16aa3bcd6b5 100644 --- a/kernel/power/hibernate.c +++ b/kernel/power/hibernate.c @@ -25,7 +25,6 @@ #include #include #include -#include #include "power.h" @@ -244,10 +243,6 @@ static int create_image(int platform_mode) { int error; - error = arch_prepare_suspend(); - if (error) - return error; - error = dpm_suspend_noirq(PMSG_FREEZE); if (error) { printk(KERN_ERR "PM: Some devices failed to power down, " -- cgit v1.2.3-70-g09d2
From f42a9813fbf930fea3bdd0524dcb43c7feb0c977 Mon Sep 17 00:00:00 2001 From: "Rafael J. Wysocki" Date: Tue, 24 May 2011 23:36:06 +0200 Subject: PM / Hibernate: Update kerneldoc comments in hibernate.c Some of the kerneldoc comments in kernel/power/hibernate.c are outdated and some of them don't adhere to the kernel's standards. Update them and give them a consistent style. Signed-off-by: Rafael J. Wysocki Acked-by: Randy Dunlap --- kernel/power/hibernate.c | 194 +++++++++++++++++++++++++---------------------- 1 file changed, 104 insertions(+), 90 deletions(-) (limited to 'kernel')
diff --git a/kernel/power/hibernate.c b/kernel/power/hibernate.c index 16aa3bcd6b5..8f7b1db1ece 100644 --- a/kernel/power/hibernate.c +++ b/kernel/power/hibernate.c @@ -54,10 +54,9 @@ static int hibernation_mode = HIBERNATION_SHUTDOWN; static const struct platform_hibernation_ops *hibernation_ops; /** - * hibernation_set_ops - set the global hibernate operations - * @ops: the hibernation operations to use in subsequent hibernation transitions + * hibernation_set_ops - Set the global hibernate operations. + * @ops: Hibernation operations to use in subsequent hibernation transitions. */ - void hibernation_set_ops(const struct platform_hibernation_ops *ops) { if (ops && !(ops->begin && ops->end && ops->pre_snapshot @@ -114,10 +113,9 @@ static int hibernation_test(int level) { return 0; } #endif /* !CONFIG_PM_DEBUG */ /** - * platform_begin - tell the platform driver that we're starting - * hibernation + * platform_begin - Call platform to start hibernation. + * @platform_mode: Whether or not to use the platform driver. */ - static int platform_begin(int platform_mode) { return (platform_mode && hibernation_ops) ? @@ -125,10 +123,9 @@ static int platform_begin(int platform_mode) } /** - * platform_end - tell the platform driver that we've entered the - * working state + * platform_end - Call platform to finish transition to the working state. + * @platform_mode: Whether or not to use the platform driver. */ - static void platform_end(int platform_mode) { if (platform_mode && hibernation_ops) @@ -136,8 +133,11 @@ static void platform_end(int platform_mode) } /** - * platform_pre_snapshot - prepare the machine for hibernation using the - * platform driver if so configured and return an error code if it fails + * platform_pre_snapshot - Call platform to prepare the machine for hibernation. + * @platform_mode: Whether or not to use the platform driver. + * + * Use the platform driver to prepare the system for creating a hibernate image, + * if so configured, and return an error code if that fails. */ static int platform_pre_snapshot(int platform_mode) @@ -147,10 +147,14 @@ static int platform_pre_snapshot(int platform_mode) } /** - * platform_leave - prepare the machine for switching to the normal mode - * of operation using the platform driver (called with interrupts disabled) + * platform_leave - Call platform to prepare a transition to the working state.
+ * @platform_mode: Whether or not to use the platform driver. + * + * Use the platform driver prepare to prepare the machine for switching to the + * normal mode of operation. + * + * This routine is called on one CPU with interrupts disabled. */ - static void platform_leave(int platform_mode) { if (platform_mode && hibernation_ops) @@ -158,10 +162,14 @@ static void platform_leave(int platform_mode) } /** - * platform_finish - switch the machine to the normal mode of operation - * using the platform driver (must be called after platform_prepare()) + * platform_finish - Call platform to switch the system to the working state. + * @platform_mode: Whether or not to use the platform driver. + * + * Use the platform driver to switch the machine to the normal mode of + * operation. + * + * This routine must be called after platform_prepare(). */ - static void platform_finish(int platform_mode) { if (platform_mode && hibernation_ops) @@ -169,11 +177,15 @@ static void platform_finish(int platform_mode) } /** - * platform_pre_restore - prepare the platform for the restoration from a - * hibernation image. If the restore fails after this function has been - * called, platform_restore_cleanup() must be called. + * platform_pre_restore - Prepare for hibernate image restoration. + * @platform_mode: Whether or not to use the platform driver. + * + * Use the platform driver to prepare the system for resume from a hibernation + * image. + * + * If the restore fails after this function has been called, + * platform_restore_cleanup() must be called. */ - static int platform_pre_restore(int platform_mode) { return (platform_mode && hibernation_ops) ? @@ -181,12 +193,16 @@ static int platform_pre_restore(int platform_mode) } /** - * platform_restore_cleanup - switch the platform to the normal mode of - * operation after a failing restore. If platform_pre_restore() has been - * called before the failing restore, this function must be called too, - * regardless of the result of platform_pre_restore(). + * platform_restore_cleanup - Switch to the working state after failing restore. + * @platform_mode: Whether or not to use the platform driver. + * + * Use the platform driver to switch the system to the normal mode of operation + * after a failing restore. + * + * If platform_pre_restore() has been called before the failing restore, this + * function must be called too, regardless of the result of + * platform_pre_restore(). */ - static void platform_restore_cleanup(int platform_mode) { if (platform_mode && hibernation_ops) @@ -194,10 +210,9 @@ static void platform_restore_cleanup(int platform_mode) } /** - * platform_recover - recover the platform from a failure to suspend - * devices. + * platform_recover - Recover from a failure to suspend devices. + * @platform_mode: Whether or not to use the platform driver. */ - static void platform_recover(int platform_mode) { if (platform_mode && hibernation_ops && hibernation_ops->recover) @@ -205,13 +220,12 @@ static void platform_recover(int platform_mode) } /** - * swsusp_show_speed - print the time elapsed between two events. - * @start: Starting event. - * @stop: Final event. - * @nr_pages - number of pages processed between @start and @stop - * @msg - introductory message to print + * swsusp_show_speed - Print time elapsed between two events during hibernation. + * @start: Starting event. + * @stop: Final event. + * @nr_pages: Number of memory pages processed between @start and @stop. + * @msg: Additional diagnostic message to print. 
*/ - void swsusp_show_speed(struct timeval *start, struct timeval *stop, unsigned nr_pages, char *msg) { @@ -234,11 +248,14 @@ void swsusp_show_speed(struct timeval *start, struct timeval *stop, } /** - * create_image - freeze devices that need to be frozen with interrupts - * off, create the hibernation image and thaw those devices. Control - * reappears in this routine after a restore. + * create_image - Create a hibernation image. + * @platform_mode: Whether or not to use the platform driver. + * + * Execute device drivers' .freeze_noirq() callbacks, create a hibernation image + * and execute the drivers' .thaw_noirq() callbacks. + * + * Control reappears in this routine after the subsequent restore. */ - static int create_image(int platform_mode) { int error; @@ -303,14 +320,11 @@ static int create_image(int platform_mode) } /** - * hibernation_snapshot - quiesce devices and create the hibernation - * snapshot image. - * @platform_mode - if set, use the platform driver, if available, to - * prepare the platform firmware for the power transition. + * hibernation_snapshot - Quiesce devices and create a hibernation image. + * @platform_mode: If set, use platform driver to prepare for the transition. * - * Must be called with pm_mutex held + * This routine must be called with pm_mutex held. */ - int hibernation_snapshot(int platform_mode) { pm_message_t msg = PMSG_RECOVER; @@ -370,13 +384,14 @@ int hibernation_snapshot(int platform_mode) } /** - * resume_target_kernel - prepare devices that need to be suspended with - * interrupts off, restore the contents of highmem that have not been - * restored yet from the image and run the low level code that will restore - * the remaining contents of memory and switch to the just restored target - * kernel. + * resume_target_kernel - Restore system state from a hibernation image. + * @platform_mode: Whether or not to use the platform driver. + * + * Execute device drivers' .freeze_noirq() callbacks, restore the contents of + * highmem that have not been restored yet from the image and run the low-level + * code that will restore the remaining contents of memory and switch to the + * just restored target kernel. */ - static int resume_target_kernel(bool platform_mode) { int error; @@ -444,14 +459,12 @@ static int resume_target_kernel(bool platform_mode) } /** - * hibernation_restore - quiesce devices and restore the hibernation - * snapshot image. If successful, control returns in hibernation_snaphot() - * @platform_mode - if set, use the platform driver, if available, to - * prepare the platform firmware for the transition. + * hibernation_restore - Quiesce devices and restore from a hibernation image. + * @platform_mode: If set, use platform driver to prepare for the transition. * - * Must be called with pm_mutex held + * This routine must be called with pm_mutex held. If it is successful, control + * reappears in the restored target kernel in hibernation_snaphot(). */ - int hibernation_restore(int platform_mode) { int error; @@ -471,10 +484,8 @@ int hibernation_restore(int platform_mode) } /** - * hibernation_platform_enter - enter the hibernation state using the - * platform driver (if available) + * hibernation_platform_enter - Power off the system using the platform driver. */ - int hibernation_platform_enter(void) { int error; @@ -545,12 +556,12 @@ int hibernation_platform_enter(void) } /** - * power_down - Shut the machine down for hibernation. + * power_down - Shut the machine down for hibernation. 
* - * Use the platform driver, if configured so; otherwise try - * to power off or reboot. + * Use the platform driver, if configured, to put the system into the sleep + * state corresponding to hibernation, or try to power it off or reboot, + * depending on the value of hibernation_mode. */ - static void power_down(void) { switch (hibernation_mode) { @@ -587,9 +598,8 @@ static int prepare_processes(void) } /** - * hibernate - The granpappy of the built-in hibernation management + * hibernate - Carry out system hibernation, including saving the image. */ - int hibernate(void) { int error; @@ -667,17 +677,20 @@ int hibernate(void) /** - * software_resume - Resume from a saved image. + * software_resume - Resume from a saved hibernation image. + * + * This routine is called as a late initcall, when all devices have been + * discovered and initialized already. * - * Called as a late_initcall (so all devices are discovered and - * initialized), we call swsusp to see if we have a saved image or not. - * If so, we quiesce devices, the restore the saved image. We will - * return above (in hibernate() ) if everything goes well. - * Otherwise, we fail gracefully and return to the normally - * scheduled program. + * The image reading code is called to see if there is a hibernation image + * available for reading. If that is the case, devices are quiesced and the + * contents of memory is restored from the saved image. * + * If this is successful, control reappears in the restored target kernel in + * hibernation_snaphot() which returns to hibernate(). Otherwise, the routine + * attempts to recover gracefully and make the kernel return to the normal mode + * of operation. */ - static int software_resume(void) { int error; @@ -807,21 +820,17 @@ static const char * const hibernation_modes[] = { [HIBERNATION_TESTPROC] = "testproc", }; -/** - * disk - Control hibernation mode - * - * Suspend-to-disk can be handled in several ways. We have a few options - * for putting the system to sleep - using the platform driver (e.g. ACPI - * or other hibernation_ops), powering off the system or rebooting the - * system (for testing) as well as the two test modes. +/* + * /sys/power/disk - Control hibernation mode. * - * The system can support 'platform', and that is known a priori (and - * encoded by the presence of hibernation_ops). However, the user may - * choose 'shutdown' or 'reboot' as alternatives, as well as one fo the - * test modes, 'test' or 'testproc'. + * Hibernation can be handled in several ways. There are a few different ways + * to put the system into the sleep state: using the platform driver (e.g. ACPI + * or other hibernation_ops), powering it off or rebooting it (for testing + * mostly), or using one of the two available test modes. * - * show() will display what the mode is currently set to. - * store() will accept one of + * The sysfs file /sys/power/disk provides an interface for selecting the + * hibernation mode to use. Reading from this file causes the available modes + * to be printed. There are 5 modes that can be supported: * * 'platform' * 'shutdown' @@ -829,8 +838,14 @@ static const char * const hibernation_modes[] = { * 'test' * 'testproc' * - * It will only change to 'platform' if the system - * supports it (as determined by having hibernation_ops). + * If a platform hibernation driver is in use, 'platform' will be supported + * and will be used by default. Otherwise, 'shutdown' will be used by default. + * The selected option (i.e. 
the one corresponding to the current value of + * hibernation_mode) is enclosed by a square bracket. + * + * To select a given hibernation mode it is necessary to write the mode's + * string representation (as returned by reading from /sys/power/disk) back + * into /sys/power/disk. */ static ssize_t disk_show(struct kobject *kobj, struct kobj_attribute *attr, @@ -863,7 +878,6 @@ static ssize_t disk_show(struct kobject *kobj, struct kobj_attribute *attr, return buf-start; } - static ssize_t disk_store(struct kobject *kobj, struct kobj_attribute *attr, const char *buf, size_t n) { -- cgit v1.2.3-70-g09d2
From e4c70a6629f9c74c4b0de258a3951890e9047c82 Mon Sep 17 00:00:00 2001 From: Peter Zijlstra Date: Tue, 24 May 2011 17:12:03 -0700 Subject: lockdep, mutex: provide mutex_lock_nest_lock In order to convert i_mmap_lock to a mutex we need a mutex equivalent to spin_lock_nest_lock(), thus provide the mutex_lock_nest_lock() annotation. As with spin_lock_nest_lock(), mutex_lock_nest_lock() allows annotation of the locking pattern where an outer lock serializes the acquisition order of nested locks. That is, if every time you lock multiple locks of class A, say A1 and A2, you first acquire the outer lock N, then the order in which A1 and A2 are acquired is irrelevant. Signed-off-by: Peter Zijlstra Cc: Benjamin Herrenschmidt Cc: David Miller Cc: Martin Schwidefsky Cc: Russell King Cc: Paul Mundt Cc: Jeff Dike Cc: Richard Weinberger Cc: Tony Luck Cc: KAMEZAWA Hiroyuki Cc: Hugh Dickins Cc: Mel Gorman Cc: KOSAKI Motohiro Cc: Nick Piggin Cc: Namhyung Kim Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- include/linux/lockdep.h | 3 +++ include/linux/mutex.h | 9 +++++++++ kernel/mutex.c | 25 +++++++++++++++++-------- 3 files changed, 29 insertions(+), 8 deletions(-) (limited to 'kernel')
diff --git a/include/linux/lockdep.h b/include/linux/lockdep.h index 4aef1dda640..ef820a3c378 100644 --- a/include/linux/lockdep.h +++ b/include/linux/lockdep.h @@ -487,12 +487,15 @@ static inline void print_irqtrace_events(struct task_struct *curr) #ifdef CONFIG_DEBUG_LOCK_ALLOC # ifdef CONFIG_PROVE_LOCKING # define mutex_acquire(l, s, t, i) lock_acquire(l, s, t, 0, 2, NULL, i) +# define mutex_acquire_nest(l, s, t, n, i) lock_acquire(l, s, t, 0, 2, n, i) # else # define mutex_acquire(l, s, t, i) lock_acquire(l, s, t, 0, 1, NULL, i) +# define mutex_acquire_nest(l, s, t, n, i) lock_acquire(l, s, t, 0, 1, n, i) # endif # define mutex_release(l, n, i) lock_release(l, n, i) #else # define mutex_acquire(l, s, t, i) do { } while (0) +# define mutex_acquire_nest(l, s, t, n, i) do { } while (0) # define mutex_release(l, n, i) do { } while (0) #endif
diff --git a/include/linux/mutex.h b/include/linux/mutex.h index c75471db576..a940fe435ac 100644 --- a/include/linux/mutex.h +++ b/include/linux/mutex.h @@ -132,6 +132,7 @@ static inline int mutex_is_locked(struct mutex *lock) */ #ifdef CONFIG_DEBUG_LOCK_ALLOC extern void mutex_lock_nested(struct mutex *lock, unsigned int subclass); +extern void _mutex_lock_nest_lock(struct mutex *lock, struct lockdep_map *nest_lock); extern int __must_check mutex_lock_interruptible_nested(struct mutex *lock, unsigned int subclass); extern int __must_check mutex_lock_killable_nested(struct mutex *lock, @@ -140,6 +141,13 @@ extern int __must_check mutex_lock_killable_nested(struct mutex *lock, #define mutex_lock(lock) mutex_lock_nested(lock, 0) #define mutex_lock_interruptible(lock) mutex_lock_interruptible_nested(lock, 0) #define mutex_lock_killable(lock) mutex_lock_killable_nested(lock, 0) + +#define mutex_lock_nest_lock(lock,
nest_lock) \ +do { \ + typecheck(struct lockdep_map *, &(nest_lock)->dep_map); \ + _mutex_lock_nest_lock(lock, &(nest_lock)->dep_map); \ +} while (0) + #else extern void mutex_lock(struct mutex *lock); extern int __must_check mutex_lock_interruptible(struct mutex *lock); @@ -148,6 +156,7 @@ extern int __must_check mutex_lock_killable(struct mutex *lock); # define mutex_lock_nested(lock, subclass) mutex_lock(lock) # define mutex_lock_interruptible_nested(lock, subclass) mutex_lock_interruptible(lock) # define mutex_lock_killable_nested(lock, subclass) mutex_lock_killable(lock) +# define mutex_lock_nest_lock(lock, nest_lock) mutex_lock(lock) #endif /* diff --git a/kernel/mutex.c b/kernel/mutex.c index 2c938e2337c..d607ed5dd44 100644 --- a/kernel/mutex.c +++ b/kernel/mutex.c @@ -131,14 +131,14 @@ EXPORT_SYMBOL(mutex_unlock); */ static inline int __sched __mutex_lock_common(struct mutex *lock, long state, unsigned int subclass, - unsigned long ip) + struct lockdep_map *nest_lock, unsigned long ip) { struct task_struct *task = current; struct mutex_waiter waiter; unsigned long flags; preempt_disable(); - mutex_acquire(&lock->dep_map, subclass, 0, ip); + mutex_acquire_nest(&lock->dep_map, subclass, 0, nest_lock, ip); #ifdef CONFIG_MUTEX_SPIN_ON_OWNER /* @@ -269,16 +269,25 @@ void __sched mutex_lock_nested(struct mutex *lock, unsigned int subclass) { might_sleep(); - __mutex_lock_common(lock, TASK_UNINTERRUPTIBLE, subclass, _RET_IP_); + __mutex_lock_common(lock, TASK_UNINTERRUPTIBLE, subclass, NULL, _RET_IP_); } EXPORT_SYMBOL_GPL(mutex_lock_nested); +void __sched +_mutex_lock_nest_lock(struct mutex *lock, struct lockdep_map *nest) +{ + might_sleep(); + __mutex_lock_common(lock, TASK_UNINTERRUPTIBLE, 0, nest, _RET_IP_); +} + +EXPORT_SYMBOL_GPL(_mutex_lock_nest_lock); + int __sched mutex_lock_killable_nested(struct mutex *lock, unsigned int subclass) { might_sleep(); - return __mutex_lock_common(lock, TASK_KILLABLE, subclass, _RET_IP_); + return __mutex_lock_common(lock, TASK_KILLABLE, subclass, NULL, _RET_IP_); } EXPORT_SYMBOL_GPL(mutex_lock_killable_nested); @@ -287,7 +296,7 @@ mutex_lock_interruptible_nested(struct mutex *lock, unsigned int subclass) { might_sleep(); return __mutex_lock_common(lock, TASK_INTERRUPTIBLE, - subclass, _RET_IP_); + subclass, NULL, _RET_IP_); } EXPORT_SYMBOL_GPL(mutex_lock_interruptible_nested); @@ -393,7 +402,7 @@ __mutex_lock_slowpath(atomic_t *lock_count) { struct mutex *lock = container_of(lock_count, struct mutex, count); - __mutex_lock_common(lock, TASK_UNINTERRUPTIBLE, 0, _RET_IP_); + __mutex_lock_common(lock, TASK_UNINTERRUPTIBLE, 0, NULL, _RET_IP_); } static noinline int __sched @@ -401,7 +410,7 @@ __mutex_lock_killable_slowpath(atomic_t *lock_count) { struct mutex *lock = container_of(lock_count, struct mutex, count); - return __mutex_lock_common(lock, TASK_KILLABLE, 0, _RET_IP_); + return __mutex_lock_common(lock, TASK_KILLABLE, 0, NULL, _RET_IP_); } static noinline int __sched @@ -409,7 +418,7 @@ __mutex_lock_interruptible_slowpath(atomic_t *lock_count) { struct mutex *lock = container_of(lock_count, struct mutex, count); - return __mutex_lock_common(lock, TASK_INTERRUPTIBLE, 0, _RET_IP_); + return __mutex_lock_common(lock, TASK_INTERRUPTIBLE, 0, NULL, _RET_IP_); } #endif -- cgit v1.2.3-70-g09d2 From 97a894136f29802da19a15541de3c019e1ca147e Mon Sep 17 00:00:00 2001 From: Peter Zijlstra Date: Tue, 24 May 2011 17:12:04 -0700 Subject: mm: Remove i_mmap_lock lockbreak Hugh says: "The only significant loser, I think, would be page reclaim (when concurrent with 
truncation): could spin for a long time waiting for the i_mmap_mutex it expects would soon be dropped? " Counter points: - cpu contention makes the spin stop (need_resched()) - zap pages should be freeing pages at a higher rate than reclaim ever can I think the simplification of the truncate code is definitely worth it. Effectively reverts: 2aa15890f3c ("mm: prevent concurrent unmap_mapping_range() on the same inode") and takes out the code that caused its problem. Signed-off-by: Peter Zijlstra Reviewed-by: KAMEZAWA Hiroyuki Cc: Hugh Dickins Cc: Benjamin Herrenschmidt Cc: David Miller Cc: Martin Schwidefsky Cc: Russell King Cc: Paul Mundt Cc: Jeff Dike Cc: Richard Weinberger Cc: Tony Luck Cc: Mel Gorman Cc: KOSAKI Motohiro Cc: Nick Piggin Cc: Namhyung Kim Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- fs/inode.c | 1 - include/linux/fs.h | 2 - include/linux/mm.h | 2 - include/linux/mm_types.h | 1 - kernel/fork.c | 1 - mm/memory.c | 195 +++++++---------------------------------------- mm/mmap.c | 13 +--- mm/mremap.c | 1 - 8 files changed, 28 insertions(+), 188 deletions(-) (limited to 'kernel') diff --git a/fs/inode.c b/fs/inode.c index 05f4fa52132..7a7284c71ab 100644 --- a/fs/inode.c +++ b/fs/inode.c @@ -331,7 +331,6 @@ void address_space_init_once(struct address_space *mapping) spin_lock_init(&mapping->private_lock); INIT_RAW_PRIO_TREE_ROOT(&mapping->i_mmap); INIT_LIST_HEAD(&mapping->i_mmap_nonlinear); - mutex_init(&mapping->unmap_mutex); } EXPORT_SYMBOL(address_space_init_once); diff --git a/include/linux/fs.h b/include/linux/fs.h index cdf9495df20..5d2c86bdf5b 100644 --- a/include/linux/fs.h +++ b/include/linux/fs.h @@ -635,7 +635,6 @@ struct address_space { struct prio_tree_root i_mmap; /* tree of private and shared mappings */ struct list_head i_mmap_nonlinear;/*list VM_NONLINEAR mappings */ spinlock_t i_mmap_lock; /* protect tree, count, list */ - unsigned int truncate_count; /* Cover race condition with truncate */ unsigned long nrpages; /* number of total pages */ pgoff_t writeback_index;/* writeback starts here */ const struct address_space_operations *a_ops; /* methods */ @@ -644,7 +643,6 @@ struct address_space { spinlock_t private_lock; /* for use by the address_space */ struct list_head private_list; /* ditto */ struct address_space *assoc_mapping; /* ditto */ - struct mutex unmap_mutex; /* to protect unmapping */ } __attribute__((aligned(sizeof(long)))); /* * On most architectures that alignment is already the case; but diff --git a/include/linux/mm.h b/include/linux/mm.h index ffcce9bf2b5..2ad0ac8c3f3 100644 --- a/include/linux/mm.h +++ b/include/linux/mm.h @@ -895,8 +895,6 @@ struct zap_details { struct address_space *check_mapping; /* Check page->mapping if set */ pgoff_t first_index; /* Lowest page->index to unmap */ pgoff_t last_index; /* Highest page->index to unmap */ - spinlock_t *i_mmap_lock; /* For unmap_mapping_range: */ - unsigned long truncate_count; /* Compare vm_truncate_count */ }; struct page *vm_normal_page(struct vm_area_struct *vma, unsigned long addr, diff --git a/include/linux/mm_types.h b/include/linux/mm_types.h index 02aa5619709..201998e5b53 100644 --- a/include/linux/mm_types.h +++ b/include/linux/mm_types.h @@ -175,7 +175,6 @@ struct vm_area_struct { units, *not* PAGE_CACHE_SIZE */ struct file * vm_file; /* File we map to (can be NULL). 
*/ void * vm_private_data; /* was vm_pte (shared mem) */ - unsigned long vm_truncate_count;/* truncate_count or restart_addr */ #ifndef CONFIG_MMU struct vm_region *vm_region; /* NOMMU mapping region */ diff --git a/kernel/fork.c b/kernel/fork.c index 2b44d82b823..4eef925477f 100644 --- a/kernel/fork.c +++ b/kernel/fork.c @@ -386,7 +386,6 @@ static int dup_mmap(struct mm_struct *mm, struct mm_struct *oldmm) spin_lock(&mapping->i_mmap_lock); if (tmp->vm_flags & VM_SHARED) mapping->i_mmap_writable++; - tmp->vm_truncate_count = mpnt->vm_truncate_count; flush_dcache_mmap_lock(mapping); /* insert tmp into the share list, just after mpnt */ vma_prio_tree_add(tmp, mpnt); diff --git a/mm/memory.c b/mm/memory.c index 17193d74f30..18655878b9f 100644 --- a/mm/memory.c +++ b/mm/memory.c @@ -986,13 +986,13 @@ int copy_page_range(struct mm_struct *dst_mm, struct mm_struct *src_mm, static unsigned long zap_pte_range(struct mmu_gather *tlb, struct vm_area_struct *vma, pmd_t *pmd, unsigned long addr, unsigned long end, - long *zap_work, struct zap_details *details) + struct zap_details *details) { struct mm_struct *mm = tlb->mm; int force_flush = 0; - pte_t *pte; - spinlock_t *ptl; int rss[NR_MM_COUNTERS]; + spinlock_t *ptl; + pte_t *pte; again: init_rss_vec(rss); @@ -1001,12 +1001,9 @@ again: do { pte_t ptent = *pte; if (pte_none(ptent)) { - (*zap_work)--; continue; } - (*zap_work) -= PAGE_SIZE; - if (pte_present(ptent)) { struct page *page; @@ -1075,7 +1072,7 @@ again: print_bad_pte(vma, addr, ptent, NULL); } pte_clear_not_present_full(mm, addr, pte, tlb->fullmm); - } while (pte++, addr += PAGE_SIZE, (addr != end && *zap_work > 0)); + } while (pte++, addr += PAGE_SIZE, addr != end); add_mm_rss_vec(mm, rss); arch_leave_lazy_mmu_mode(); @@ -1099,7 +1096,7 @@ again: static inline unsigned long zap_pmd_range(struct mmu_gather *tlb, struct vm_area_struct *vma, pud_t *pud, unsigned long addr, unsigned long end, - long *zap_work, struct zap_details *details) + struct zap_details *details) { pmd_t *pmd; unsigned long next; @@ -1111,19 +1108,15 @@ static inline unsigned long zap_pmd_range(struct mmu_gather *tlb, if (next-addr != HPAGE_PMD_SIZE) { VM_BUG_ON(!rwsem_is_locked(&tlb->mm->mmap_sem)); split_huge_page_pmd(vma->vm_mm, pmd); - } else if (zap_huge_pmd(tlb, vma, pmd)) { - (*zap_work)--; + } else if (zap_huge_pmd(tlb, vma, pmd)) continue; - } /* fall through */ } - if (pmd_none_or_clear_bad(pmd)) { - (*zap_work)--; + if (pmd_none_or_clear_bad(pmd)) continue; - } - next = zap_pte_range(tlb, vma, pmd, addr, next, - zap_work, details); - } while (pmd++, addr = next, (addr != end && *zap_work > 0)); + next = zap_pte_range(tlb, vma, pmd, addr, next, details); + cond_resched(); + } while (pmd++, addr = next, addr != end); return addr; } @@ -1131,7 +1124,7 @@ static inline unsigned long zap_pmd_range(struct mmu_gather *tlb, static inline unsigned long zap_pud_range(struct mmu_gather *tlb, struct vm_area_struct *vma, pgd_t *pgd, unsigned long addr, unsigned long end, - long *zap_work, struct zap_details *details) + struct zap_details *details) { pud_t *pud; unsigned long next; @@ -1139,13 +1132,10 @@ static inline unsigned long zap_pud_range(struct mmu_gather *tlb, pud = pud_offset(pgd, addr); do { next = pud_addr_end(addr, end); - if (pud_none_or_clear_bad(pud)) { - (*zap_work)--; + if (pud_none_or_clear_bad(pud)) continue; - } - next = zap_pmd_range(tlb, vma, pud, addr, next, - zap_work, details); - } while (pud++, addr = next, (addr != end && *zap_work > 0)); + next = zap_pmd_range(tlb, vma, pud, addr, next, 
details); + } while (pud++, addr = next, addr != end); return addr; } @@ -1153,7 +1143,7 @@ static inline unsigned long zap_pud_range(struct mmu_gather *tlb, static unsigned long unmap_page_range(struct mmu_gather *tlb, struct vm_area_struct *vma, unsigned long addr, unsigned long end, - long *zap_work, struct zap_details *details) + struct zap_details *details) { pgd_t *pgd; unsigned long next; @@ -1167,13 +1157,10 @@ static unsigned long unmap_page_range(struct mmu_gather *tlb, pgd = pgd_offset(vma->vm_mm, addr); do { next = pgd_addr_end(addr, end); - if (pgd_none_or_clear_bad(pgd)) { - (*zap_work)--; + if (pgd_none_or_clear_bad(pgd)) continue; - } - next = zap_pud_range(tlb, vma, pgd, addr, next, - zap_work, details); - } while (pgd++, addr = next, (addr != end && *zap_work > 0)); + next = zap_pud_range(tlb, vma, pgd, addr, next, details); + } while (pgd++, addr = next, addr != end); tlb_end_vma(tlb, vma); mem_cgroup_uncharge_end(); @@ -1218,9 +1205,7 @@ unsigned long unmap_vmas(struct mmu_gather *tlb, unsigned long end_addr, unsigned long *nr_accounted, struct zap_details *details) { - long zap_work = ZAP_BLOCK_SIZE; unsigned long start = start_addr; - spinlock_t *i_mmap_lock = details? details->i_mmap_lock: NULL; struct mm_struct *mm = vma->vm_mm; mmu_notifier_invalidate_range_start(mm, start_addr, end_addr); @@ -1253,33 +1238,15 @@ unsigned long unmap_vmas(struct mmu_gather *tlb, * Since no pte has actually been setup, it is * safe to do nothing in this case. */ - if (vma->vm_file) { + if (vma->vm_file) unmap_hugepage_range(vma, start, end, NULL); - zap_work -= (end - start) / - pages_per_huge_page(hstate_vma(vma)); - } start = end; } else - start = unmap_page_range(tlb, vma, - start, end, &zap_work, details); - - if (zap_work > 0) { - BUG_ON(start != end); - break; - } - - if (need_resched() || - (i_mmap_lock && spin_needbreak(i_mmap_lock))) { - if (i_mmap_lock) - goto out; - cond_resched(); - } - - zap_work = ZAP_BLOCK_SIZE; + start = unmap_page_range(tlb, vma, start, end, details); } } -out: + mmu_notifier_invalidate_range_end(mm, start_addr, end_addr); return start; /* which is now the end (or restart) address */ } @@ -2612,96 +2579,11 @@ unwritable_page: return ret; } -/* - * Helper functions for unmap_mapping_range(). - * - * __ Notes on dropping i_mmap_lock to reduce latency while unmapping __ - * - * We have to restart searching the prio_tree whenever we drop the lock, - * since the iterator is only valid while the lock is held, and anyway - * a later vma might be split and reinserted earlier while lock dropped. - * - * The list of nonlinear vmas could be handled more efficiently, using - * a placeholder, but handle it in the same way until a need is shown. - * It is important to search the prio_tree before nonlinear list: a vma - * may become nonlinear and be shifted from prio_tree to nonlinear list - * while the lock is dropped; but never shifted from list to prio_tree. - * - * In order to make forward progress despite restarting the search, - * vm_truncate_count is used to mark a vma as now dealt with, so we can - * quickly skip it next time around. Since the prio_tree search only - * shows us those vmas affected by unmapping the range in question, we - * can't efficiently keep all vmas in step with mapping->truncate_count: - * so instead reset them all whenever it wraps back to 0 (then go to 1). - * mapping->truncate_count and vma->vm_truncate_count are protected by - * i_mmap_lock. 
- * - * In order to make forward progress despite repeatedly restarting some - * large vma, note the restart_addr from unmap_vmas when it breaks out: - * and restart from that address when we reach that vma again. It might - * have been split or merged, shrunk or extended, but never shifted: so - * restart_addr remains valid so long as it remains in the vma's range. - * unmap_mapping_range forces truncate_count to leap over page-aligned - * values so we can save vma's restart_addr in its truncate_count field. - */ -#define is_restart_addr(truncate_count) (!((truncate_count) & ~PAGE_MASK)) - -static void reset_vma_truncate_counts(struct address_space *mapping) -{ - struct vm_area_struct *vma; - struct prio_tree_iter iter; - - vma_prio_tree_foreach(vma, &iter, &mapping->i_mmap, 0, ULONG_MAX) - vma->vm_truncate_count = 0; - list_for_each_entry(vma, &mapping->i_mmap_nonlinear, shared.vm_set.list) - vma->vm_truncate_count = 0; -} - -static int unmap_mapping_range_vma(struct vm_area_struct *vma, +static void unmap_mapping_range_vma(struct vm_area_struct *vma, unsigned long start_addr, unsigned long end_addr, struct zap_details *details) { - unsigned long restart_addr; - int need_break; - - /* - * files that support invalidating or truncating portions of the - * file from under mmaped areas must have their ->fault function - * return a locked page (and set VM_FAULT_LOCKED in the return). - * This provides synchronisation against concurrent unmapping here. - */ - -again: - restart_addr = vma->vm_truncate_count; - if (is_restart_addr(restart_addr) && start_addr < restart_addr) { - start_addr = restart_addr; - if (start_addr >= end_addr) { - /* Top of vma has been split off since last time */ - vma->vm_truncate_count = details->truncate_count; - return 0; - } - } - - restart_addr = zap_page_range(vma, start_addr, - end_addr - start_addr, details); - need_break = need_resched() || spin_needbreak(details->i_mmap_lock); - - if (restart_addr >= end_addr) { - /* We have now completed this vma: mark it so */ - vma->vm_truncate_count = details->truncate_count; - if (!need_break) - return 0; - } else { - /* Note restart_addr in vma's truncate_count field */ - vma->vm_truncate_count = restart_addr; - if (!need_break) - goto again; - } - - spin_unlock(details->i_mmap_lock); - cond_resched(); - spin_lock(details->i_mmap_lock); - return -EINTR; + zap_page_range(vma, start_addr, end_addr - start_addr, details); } static inline void unmap_mapping_range_tree(struct prio_tree_root *root, @@ -2711,12 +2593,8 @@ static inline void unmap_mapping_range_tree(struct prio_tree_root *root, struct prio_tree_iter iter; pgoff_t vba, vea, zba, zea; -restart: vma_prio_tree_foreach(vma, &iter, root, details->first_index, details->last_index) { - /* Skip quickly over those we have already dealt with */ - if (vma->vm_truncate_count == details->truncate_count) - continue; vba = vma->vm_pgoff; vea = vba + ((vma->vm_end - vma->vm_start) >> PAGE_SHIFT) - 1; @@ -2728,11 +2606,10 @@ restart: if (zea > vea) zea = vea; - if (unmap_mapping_range_vma(vma, + unmap_mapping_range_vma(vma, ((zba - vba) << PAGE_SHIFT) + vma->vm_start, ((zea - vba + 1) << PAGE_SHIFT) + vma->vm_start, - details) < 0) - goto restart; + details); } } @@ -2747,15 +2624,9 @@ static inline void unmap_mapping_range_list(struct list_head *head, * across *all* the pages in each nonlinear VMA, not just the pages * whose virtual address lies outside the file truncation point. 
*/ -restart: list_for_each_entry(vma, head, shared.vm_set.list) { - /* Skip quickly over those we have already dealt with */ - if (vma->vm_truncate_count == details->truncate_count) - continue; details->nonlinear_vma = vma; - if (unmap_mapping_range_vma(vma, vma->vm_start, - vma->vm_end, details) < 0) - goto restart; + unmap_mapping_range_vma(vma, vma->vm_start, vma->vm_end, details); } } @@ -2794,26 +2665,14 @@ void unmap_mapping_range(struct address_space *mapping, details.last_index = hba + hlen - 1; if (details.last_index < details.first_index) details.last_index = ULONG_MAX; - details.i_mmap_lock = &mapping->i_mmap_lock; - mutex_lock(&mapping->unmap_mutex); - spin_lock(&mapping->i_mmap_lock); - - /* Protect against endless unmapping loops */ - mapping->truncate_count++; - if (unlikely(is_restart_addr(mapping->truncate_count))) { - if (mapping->truncate_count == 0) - reset_vma_truncate_counts(mapping); - mapping->truncate_count++; - } - details.truncate_count = mapping->truncate_count; + spin_lock(&mapping->i_mmap_lock); if (unlikely(!prio_tree_empty(&mapping->i_mmap))) unmap_mapping_range_tree(&mapping->i_mmap, &details); if (unlikely(!list_empty(&mapping->i_mmap_nonlinear))) unmap_mapping_range_list(&mapping->i_mmap_nonlinear, &details); spin_unlock(&mapping->i_mmap_lock); - mutex_unlock(&mapping->unmap_mutex); } EXPORT_SYMBOL(unmap_mapping_range); diff --git a/mm/mmap.c b/mm/mmap.c index 40d49986e71..50cb04bb56b 100644 --- a/mm/mmap.c +++ b/mm/mmap.c @@ -445,10 +445,8 @@ static void vma_link(struct mm_struct *mm, struct vm_area_struct *vma, if (vma->vm_file) mapping = vma->vm_file->f_mapping; - if (mapping) { + if (mapping) spin_lock(&mapping->i_mmap_lock); - vma->vm_truncate_count = mapping->truncate_count; - } __vma_link(mm, vma, prev, rb_link, rb_parent); __vma_link_file(vma); @@ -558,16 +556,7 @@ again: remove_next = 1 + (end > next->vm_end); if (!(vma->vm_flags & VM_NONLINEAR)) root = &mapping->i_mmap; spin_lock(&mapping->i_mmap_lock); - if (importer && - vma->vm_truncate_count != next->vm_truncate_count) { - /* - * unmap_mapping_range might be in progress: - * ensure that the expanding vma is rescanned. - */ - importer->vm_truncate_count = 0; - } if (insert) { - insert->vm_truncate_count = vma->vm_truncate_count; /* * Put into prio_tree now, so instantiated pages * are visible to arm/parisc __flush_dcache_page diff --git a/mm/mremap.c b/mm/mremap.c index a7c1f9f9b94..909e1e1e99b 100644 --- a/mm/mremap.c +++ b/mm/mremap.c @@ -94,7 +94,6 @@ static void move_ptes(struct vm_area_struct *vma, pmd_t *old_pmd, */ mapping = vma->vm_file->f_mapping; spin_lock(&mapping->i_mmap_lock); - new_vma->vm_truncate_count = 0; } /* -- cgit v1.2.3-70-g09d2 From 3d48ae45e72390ddf8cc5256ac32ed6f7a19cbea Mon Sep 17 00:00:00 2001 From: Peter Zijlstra Date: Tue, 24 May 2011 17:12:06 -0700 Subject: mm: Convert i_mmap_lock to a mutex Straightforward conversion of i_mmap_lock to a mutex. 
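Reduced to a sketch, the shape of the conversion at each call site (identifiers taken from the hunks below, loop bodies elided) is:

	/* Before: a spinlock, so the protected walk may not sleep. */
	spin_lock(&mapping->i_mmap_lock);
	vma_prio_tree_foreach(svma, &iter, &mapping->i_mmap, idx, idx) {
		/* ... walk the mappings ... */
	}
	spin_unlock(&mapping->i_mmap_lock);

	/* After: a mutex, so the same section may now block or reschedule. */
	mutex_lock(&mapping->i_mmap_mutex);
	vma_prio_tree_foreach(svma, &iter, &mapping->i_mmap, idx, idx) {
		/* ... walk the mappings ... */
	}
	mutex_unlock(&mapping->i_mmap_mutex);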
Signed-off-by: Peter Zijlstra Acked-by: Hugh Dickins Cc: Benjamin Herrenschmidt Cc: David Miller Cc: Martin Schwidefsky Cc: Russell King Cc: Paul Mundt Cc: Jeff Dike Cc: Richard Weinberger Cc: Tony Luck Cc: KAMEZAWA Hiroyuki Cc: Mel Gorman Cc: KOSAKI Motohiro Cc: Nick Piggin Cc: Namhyung Kim Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- Documentation/lockstat.txt | 2 +- Documentation/vm/locking | 2 +- arch/x86/mm/hugetlbpage.c | 4 ++-- fs/hugetlbfs/inode.c | 4 ++-- fs/inode.c | 2 +- include/linux/fs.h | 2 +- include/linux/mmu_notifier.h | 2 +- kernel/fork.c | 4 ++-- mm/filemap.c | 10 +++++----- mm/filemap_xip.c | 4 ++-- mm/fremap.c | 4 ++-- mm/hugetlb.c | 14 +++++++------- mm/memory-failure.c | 4 ++-- mm/memory.c | 4 ++-- mm/mmap.c | 22 +++++++++++----------- mm/mremap.c | 4 ++-- mm/rmap.c | 28 ++++++++++++++-------------- 17 files changed, 58 insertions(+), 58 deletions(-) (limited to 'kernel') diff --git a/Documentation/lockstat.txt b/Documentation/lockstat.txt index 65f4c795015..9c0a80d17a2 100644 --- a/Documentation/lockstat.txt +++ b/Documentation/lockstat.txt @@ -136,7 +136,7 @@ View the top contending locks: dcache_lock: 1037 1161 0.38 45.32 774.51 6611 243371 0.15 306.48 77387.24 &inode->i_mutex: 161 286 18446744073709 62882.54 1244614.55 3653 20598 18446744073709 62318.60 1693822.74 &zone->lru_lock: 94 94 0.53 7.33 92.10 4366 32690 0.29 59.81 16350.06 - &inode->i_data.i_mmap_lock: 79 79 0.40 3.77 53.03 11779 87755 0.28 116.93 29898.44 + &inode->i_data.i_mmap_mutex: 79 79 0.40 3.77 53.03 11779 87755 0.28 116.93 29898.44 &q->__queue_lock: 48 50 0.52 31.62 86.31 774 13131 0.17 113.08 12277.52 &rq->rq_lock_key: 43 47 0.74 68.50 170.63 3706 33929 0.22 107.99 17460.62 &rq->rq_lock_key#2: 39 46 0.75 6.68 49.03 2979 32292 0.17 125.17 17137.63 diff --git a/Documentation/vm/locking b/Documentation/vm/locking index 25fadb44876..f61228bd639 100644 --- a/Documentation/vm/locking +++ b/Documentation/vm/locking @@ -66,7 +66,7 @@ in some cases it is not really needed. Eg, vm_start is modified by expand_stack(), it is hard to come up with a destructive scenario without having the vmlist protection in this case. -The page_table_lock nests with the inode i_mmap_lock and the kmem cache +The page_table_lock nests with the inode i_mmap_mutex and the kmem cache c_spinlock spinlocks. This is okay, since the kmem code asks for pages after dropping c_spinlock. 
The page_table_lock also nests with pagecache_lock and pagemap_lru_lock spinlocks, and no code asks for memory with these locks diff --git a/arch/x86/mm/hugetlbpage.c b/arch/x86/mm/hugetlbpage.c index d4203988504..f581a18c0d4 100644 --- a/arch/x86/mm/hugetlbpage.c +++ b/arch/x86/mm/hugetlbpage.c @@ -72,7 +72,7 @@ static void huge_pmd_share(struct mm_struct *mm, unsigned long addr, pud_t *pud) if (!vma_shareable(vma, addr)) return; - spin_lock(&mapping->i_mmap_lock); + mutex_lock(&mapping->i_mmap_mutex); vma_prio_tree_foreach(svma, &iter, &mapping->i_mmap, idx, idx) { if (svma == vma) continue; @@ -97,7 +97,7 @@ static void huge_pmd_share(struct mm_struct *mm, unsigned long addr, pud_t *pud) put_page(virt_to_page(spte)); spin_unlock(&mm->page_table_lock); out: - spin_unlock(&mapping->i_mmap_lock); + mutex_unlock(&mapping->i_mmap_mutex); } /* diff --git a/fs/hugetlbfs/inode.c b/fs/hugetlbfs/inode.c index b9eeb1cd03f..e7a035781b7 100644 --- a/fs/hugetlbfs/inode.c +++ b/fs/hugetlbfs/inode.c @@ -412,10 +412,10 @@ static int hugetlb_vmtruncate(struct inode *inode, loff_t offset) pgoff = offset >> PAGE_SHIFT; i_size_write(inode, offset); - spin_lock(&mapping->i_mmap_lock); + mutex_lock(&mapping->i_mmap_mutex); if (!prio_tree_empty(&mapping->i_mmap)) hugetlb_vmtruncate_list(&mapping->i_mmap, pgoff); - spin_unlock(&mapping->i_mmap_lock); + mutex_unlock(&mapping->i_mmap_mutex); truncate_hugepages(inode, offset); return 0; } diff --git a/fs/inode.c b/fs/inode.c index 7a7284c71ab..88aa3bcc768 100644 --- a/fs/inode.c +++ b/fs/inode.c @@ -326,7 +326,7 @@ void address_space_init_once(struct address_space *mapping) memset(mapping, 0, sizeof(*mapping)); INIT_RADIX_TREE(&mapping->page_tree, GFP_ATOMIC); spin_lock_init(&mapping->tree_lock); - spin_lock_init(&mapping->i_mmap_lock); + mutex_init(&mapping->i_mmap_mutex); INIT_LIST_HEAD(&mapping->private_list); spin_lock_init(&mapping->private_lock); INIT_RAW_PRIO_TREE_ROOT(&mapping->i_mmap); diff --git a/include/linux/fs.h b/include/linux/fs.h index 5d2c86bdf5b..5bb9e826019 100644 --- a/include/linux/fs.h +++ b/include/linux/fs.h @@ -634,7 +634,7 @@ struct address_space { unsigned int i_mmap_writable;/* count VM_SHARED mappings */ struct prio_tree_root i_mmap; /* tree of private and shared mappings */ struct list_head i_mmap_nonlinear;/*list VM_NONLINEAR mappings */ - spinlock_t i_mmap_lock; /* protect tree, count, list */ + struct mutex i_mmap_mutex; /* protect tree, count, list */ unsigned long nrpages; /* number of total pages */ pgoff_t writeback_index;/* writeback starts here */ const struct address_space_operations *a_ops; /* methods */ diff --git a/include/linux/mmu_notifier.h b/include/linux/mmu_notifier.h index cc2e7dfea9d..a877dfc243e 100644 --- a/include/linux/mmu_notifier.h +++ b/include/linux/mmu_notifier.h @@ -150,7 +150,7 @@ struct mmu_notifier_ops { * Therefore notifier chains can only be traversed when either * * 1. mmap_sem is held. - * 2. One of the reverse map locks is held (i_mmap_lock or anon_vma->lock). + * 2. One of the reverse map locks is held (i_mmap_mutex or anon_vma->lock). * 3. 
No other concurrent thread can access the list (release) */ struct mmu_notifier { diff --git a/kernel/fork.c b/kernel/fork.c index 4eef925477f..927692734bc 100644 --- a/kernel/fork.c +++ b/kernel/fork.c @@ -383,14 +383,14 @@ static int dup_mmap(struct mm_struct *mm, struct mm_struct *oldmm) get_file(file); if (tmp->vm_flags & VM_DENYWRITE) atomic_dec(&inode->i_writecount); - spin_lock(&mapping->i_mmap_lock); + mutex_lock(&mapping->i_mmap_mutex); if (tmp->vm_flags & VM_SHARED) mapping->i_mmap_writable++; flush_dcache_mmap_lock(mapping); /* insert tmp into the share list, just after mpnt */ vma_prio_tree_add(tmp, mpnt); flush_dcache_mmap_unlock(mapping); - spin_unlock(&mapping->i_mmap_lock); + mutex_unlock(&mapping->i_mmap_mutex); } /* diff --git a/mm/filemap.c b/mm/filemap.c index 8144f87dcbb..88354ae0b1f 100644 --- a/mm/filemap.c +++ b/mm/filemap.c @@ -58,16 +58,16 @@ /* * Lock ordering: * - * ->i_mmap_lock (truncate_pagecache) + * ->i_mmap_mutex (truncate_pagecache) * ->private_lock (__free_pte->__set_page_dirty_buffers) * ->swap_lock (exclusive_swap_page, others) * ->mapping->tree_lock * * ->i_mutex - * ->i_mmap_lock (truncate->unmap_mapping_range) + * ->i_mmap_mutex (truncate->unmap_mapping_range) * * ->mmap_sem - * ->i_mmap_lock + * ->i_mmap_mutex * ->page_table_lock or pte_lock (various, mainly in memory.c) * ->mapping->tree_lock (arch-dependent flush_dcache_mmap_lock) * @@ -84,7 +84,7 @@ * sb_lock (fs/fs-writeback.c) * ->mapping->tree_lock (__sync_single_inode) * - * ->i_mmap_lock + * ->i_mmap_mutex * ->anon_vma.lock (vma_adjust) * * ->anon_vma.lock @@ -106,7 +106,7 @@ * * (code doesn't rely on that order, so you could switch it around) * ->tasklist_lock (memory_failure, collect_procs_ao) - * ->i_mmap_lock + * ->i_mmap_mutex */ /* diff --git a/mm/filemap_xip.c b/mm/filemap_xip.c index 83364df74a3..93356cd1282 100644 --- a/mm/filemap_xip.c +++ b/mm/filemap_xip.c @@ -183,7 +183,7 @@ __xip_unmap (struct address_space * mapping, return; retry: - spin_lock(&mapping->i_mmap_lock); + mutex_lock(&mapping->i_mmap_mutex); vma_prio_tree_foreach(vma, &iter, &mapping->i_mmap, pgoff, pgoff) { mm = vma->vm_mm; address = vma->vm_start + @@ -201,7 +201,7 @@ retry: page_cache_release(page); } } - spin_unlock(&mapping->i_mmap_lock); + mutex_unlock(&mapping->i_mmap_mutex); if (locked) { mutex_unlock(&xip_sparse_mutex); diff --git a/mm/fremap.c b/mm/fremap.c index ec520c7b28d..7f4123056e0 100644 --- a/mm/fremap.c +++ b/mm/fremap.c @@ -211,13 +211,13 @@ SYSCALL_DEFINE5(remap_file_pages, unsigned long, start, unsigned long, size, } goto out; } - spin_lock(&mapping->i_mmap_lock); + mutex_lock(&mapping->i_mmap_mutex); flush_dcache_mmap_lock(mapping); vma->vm_flags |= VM_NONLINEAR; vma_prio_tree_remove(vma, &mapping->i_mmap); vma_nonlinear_insert(vma, &mapping->i_mmap_nonlinear); flush_dcache_mmap_unlock(mapping); - spin_unlock(&mapping->i_mmap_lock); + mutex_unlock(&mapping->i_mmap_mutex); } if (vma->vm_flags & VM_LOCKED) { diff --git a/mm/hugetlb.c b/mm/hugetlb.c index bbb4a5bbb95..5fd68b95c67 100644 --- a/mm/hugetlb.c +++ b/mm/hugetlb.c @@ -2205,7 +2205,7 @@ void __unmap_hugepage_range(struct vm_area_struct *vma, unsigned long start, unsigned long sz = huge_page_size(h); /* - * A page gathering list, protected by per file i_mmap_lock. The + * A page gathering list, protected by per file i_mmap_mutex. The * lock is used to avoid list corruption from multiple unmapping * of the same page since we are using page->lru. 
*/ @@ -2274,9 +2274,9 @@ void __unmap_hugepage_range(struct vm_area_struct *vma, unsigned long start, void unmap_hugepage_range(struct vm_area_struct *vma, unsigned long start, unsigned long end, struct page *ref_page) { - spin_lock(&vma->vm_file->f_mapping->i_mmap_lock); + mutex_lock(&vma->vm_file->f_mapping->i_mmap_mutex); __unmap_hugepage_range(vma, start, end, ref_page); - spin_unlock(&vma->vm_file->f_mapping->i_mmap_lock); + mutex_unlock(&vma->vm_file->f_mapping->i_mmap_mutex); } /* @@ -2308,7 +2308,7 @@ static int unmap_ref_private(struct mm_struct *mm, struct vm_area_struct *vma, * this mapping should be shared between all the VMAs, * __unmap_hugepage_range() is called as the lock is already held */ - spin_lock(&mapping->i_mmap_lock); + mutex_lock(&mapping->i_mmap_mutex); vma_prio_tree_foreach(iter_vma, &iter, &mapping->i_mmap, pgoff, pgoff) { /* Do not unmap the current VMA */ if (iter_vma == vma) @@ -2326,7 +2326,7 @@ static int unmap_ref_private(struct mm_struct *mm, struct vm_area_struct *vma, address, address + huge_page_size(h), page); } - spin_unlock(&mapping->i_mmap_lock); + mutex_unlock(&mapping->i_mmap_mutex); return 1; } @@ -2810,7 +2810,7 @@ void hugetlb_change_protection(struct vm_area_struct *vma, BUG_ON(address >= end); flush_cache_range(vma, address, end); - spin_lock(&vma->vm_file->f_mapping->i_mmap_lock); + mutex_lock(&vma->vm_file->f_mapping->i_mmap_mutex); spin_lock(&mm->page_table_lock); for (; address < end; address += huge_page_size(h)) { ptep = huge_pte_offset(mm, address); @@ -2825,7 +2825,7 @@ void hugetlb_change_protection(struct vm_area_struct *vma, } } spin_unlock(&mm->page_table_lock); - spin_unlock(&vma->vm_file->f_mapping->i_mmap_lock); + mutex_unlock(&vma->vm_file->f_mapping->i_mmap_mutex); flush_tlb_range(vma, start, end); } diff --git a/mm/memory-failure.c b/mm/memory-failure.c index 2b9a5eef39e..12178ec32ab 100644 --- a/mm/memory-failure.c +++ b/mm/memory-failure.c @@ -429,7 +429,7 @@ static void collect_procs_file(struct page *page, struct list_head *to_kill, */ read_lock(&tasklist_lock); - spin_lock(&mapping->i_mmap_lock); + mutex_lock(&mapping->i_mmap_mutex); for_each_process(tsk) { pgoff_t pgoff = page->index << (PAGE_CACHE_SHIFT - PAGE_SHIFT); @@ -449,7 +449,7 @@ static void collect_procs_file(struct page *page, struct list_head *to_kill, add_to_kill(tsk, page, vma, to_kill, tkc); } } - spin_unlock(&mapping->i_mmap_lock); + mutex_unlock(&mapping->i_mmap_mutex); read_unlock(&tasklist_lock); } diff --git a/mm/memory.c b/mm/memory.c index 18655878b9f..7bbe4d3df75 100644 --- a/mm/memory.c +++ b/mm/memory.c @@ -2667,12 +2667,12 @@ void unmap_mapping_range(struct address_space *mapping, details.last_index = ULONG_MAX; - spin_lock(&mapping->i_mmap_lock); + mutex_lock(&mapping->i_mmap_mutex); if (unlikely(!prio_tree_empty(&mapping->i_mmap))) unmap_mapping_range_tree(&mapping->i_mmap, &details); if (unlikely(!list_empty(&mapping->i_mmap_nonlinear))) unmap_mapping_range_list(&mapping->i_mmap_nonlinear, &details); - spin_unlock(&mapping->i_mmap_lock); + mutex_unlock(&mapping->i_mmap_mutex); } EXPORT_SYMBOL(unmap_mapping_range); diff --git a/mm/mmap.c b/mm/mmap.c index 50cb04bb56b..26efbfca0b2 100644 --- a/mm/mmap.c +++ b/mm/mmap.c @@ -194,7 +194,7 @@ error: } /* - * Requires inode->i_mapping->i_mmap_lock + * Requires inode->i_mapping->i_mmap_mutex */ static void __remove_shared_vm_struct(struct vm_area_struct *vma, struct file *file, struct address_space *mapping) @@ -222,9 +222,9 @@ void unlink_file_vma(struct vm_area_struct *vma) if (file) { struct 
address_space *mapping = file->f_mapping; - spin_lock(&mapping->i_mmap_lock); + mutex_lock(&mapping->i_mmap_mutex); __remove_shared_vm_struct(vma, file, mapping); - spin_unlock(&mapping->i_mmap_lock); + mutex_unlock(&mapping->i_mmap_mutex); } } @@ -446,13 +446,13 @@ static void vma_link(struct mm_struct *mm, struct vm_area_struct *vma, mapping = vma->vm_file->f_mapping; if (mapping) - spin_lock(&mapping->i_mmap_lock); + mutex_lock(&mapping->i_mmap_mutex); __vma_link(mm, vma, prev, rb_link, rb_parent); __vma_link_file(vma); if (mapping) - spin_unlock(&mapping->i_mmap_lock); + mutex_unlock(&mapping->i_mmap_mutex); mm->map_count++; validate_mm(mm); @@ -555,7 +555,7 @@ again: remove_next = 1 + (end > next->vm_end); mapping = file->f_mapping; if (!(vma->vm_flags & VM_NONLINEAR)) root = &mapping->i_mmap; - spin_lock(&mapping->i_mmap_lock); + mutex_lock(&mapping->i_mmap_mutex); if (insert) { /* * Put into prio_tree now, so instantiated pages @@ -622,7 +622,7 @@ again: remove_next = 1 + (end > next->vm_end); if (anon_vma) anon_vma_unlock(anon_vma); if (mapping) - spin_unlock(&mapping->i_mmap_lock); + mutex_unlock(&mapping->i_mmap_mutex); if (remove_next) { if (file) { @@ -2290,7 +2290,7 @@ void exit_mmap(struct mm_struct *mm) /* Insert vm structure into process list sorted by address * and into the inode's i_mmap tree. If vm_file is non-NULL - * then i_mmap_lock is taken here. + * then i_mmap_mutex is taken here. */ int insert_vm_struct(struct mm_struct * mm, struct vm_area_struct * vma) { @@ -2532,7 +2532,7 @@ static void vm_lock_mapping(struct mm_struct *mm, struct address_space *mapping) */ if (test_and_set_bit(AS_MM_ALL_LOCKS, &mapping->flags)) BUG(); - spin_lock_nest_lock(&mapping->i_mmap_lock, &mm->mmap_sem); + mutex_lock_nest_lock(&mapping->i_mmap_mutex, &mm->mmap_sem); } } @@ -2559,7 +2559,7 @@ static void vm_lock_mapping(struct mm_struct *mm, struct address_space *mapping) * vma in this mm is backed by the same anon_vma or address_space. * * We can take all the locks in random order because the VM code - * taking i_mmap_lock or anon_vma->lock outside the mmap_sem never + * taking i_mmap_mutex or anon_vma->lock outside the mmap_sem never * takes more than one of them in a row. Secondly we're protected * against a concurrent mm_take_all_locks() by the mm_all_locks_mutex. * @@ -2631,7 +2631,7 @@ static void vm_unlock_mapping(struct address_space *mapping) * AS_MM_ALL_LOCKS can't change to 0 from under us * because we hold the mm_all_locks_mutex. */ - spin_unlock(&mapping->i_mmap_lock); + mutex_unlock(&mapping->i_mmap_mutex); if (!test_and_clear_bit(AS_MM_ALL_LOCKS, &mapping->flags)) BUG(); diff --git a/mm/mremap.c b/mm/mremap.c index 909e1e1e99b..506fa44403d 100644 --- a/mm/mremap.c +++ b/mm/mremap.c @@ -93,7 +93,7 @@ static void move_ptes(struct vm_area_struct *vma, pmd_t *old_pmd, * and we propagate stale pages into the dst afterward. 
*/ mapping = vma->vm_file->f_mapping; - spin_lock(&mapping->i_mmap_lock); + mutex_lock(&mapping->i_mmap_mutex); } /* @@ -122,7 +122,7 @@ static void move_ptes(struct vm_area_struct *vma, pmd_t *old_pmd, pte_unmap(new_pte - 1); pte_unmap_unlock(old_pte - 1, old_ptl); if (mapping) - spin_unlock(&mapping->i_mmap_lock); + mutex_unlock(&mapping->i_mmap_mutex); mmu_notifier_invalidate_range_end(vma->vm_mm, old_start, old_end); } diff --git a/mm/rmap.c b/mm/rmap.c index 522e4a93cad..f0ef7ea5423 100644 --- a/mm/rmap.c +++ b/mm/rmap.c @@ -24,7 +24,7 @@ * inode->i_alloc_sem (vmtruncate_range) * mm->mmap_sem * page->flags PG_locked (lock_page) - * mapping->i_mmap_lock + * mapping->i_mmap_mutex * anon_vma->lock * mm->page_table_lock or pte_lock * zone->lru_lock (in mark_page_accessed, isolate_lru_page) @@ -646,14 +646,14 @@ static int page_referenced_file(struct page *page, * The page lock not only makes sure that page->mapping cannot * suddenly be NULLified by truncation, it makes sure that the * structure at mapping cannot be freed and reused yet, - * so we can safely take mapping->i_mmap_lock. + * so we can safely take mapping->i_mmap_mutex. */ BUG_ON(!PageLocked(page)); - spin_lock(&mapping->i_mmap_lock); + mutex_lock(&mapping->i_mmap_mutex); /* - * i_mmap_lock does not stabilize mapcount at all, but mapcount + * i_mmap_mutex does not stabilize mapcount at all, but mapcount * is more likely to be accurate if we note it after spinning. */ mapcount = page_mapcount(page); @@ -675,7 +675,7 @@ static int page_referenced_file(struct page *page, break; } - spin_unlock(&mapping->i_mmap_lock); + mutex_unlock(&mapping->i_mmap_mutex); return referenced; } @@ -762,7 +762,7 @@ static int page_mkclean_file(struct address_space *mapping, struct page *page) BUG_ON(PageAnon(page)); - spin_lock(&mapping->i_mmap_lock); + mutex_lock(&mapping->i_mmap_mutex); vma_prio_tree_foreach(vma, &iter, &mapping->i_mmap, pgoff, pgoff) { if (vma->vm_flags & VM_SHARED) { unsigned long address = vma_address(page, vma); @@ -771,7 +771,7 @@ static int page_mkclean_file(struct address_space *mapping, struct page *page) ret += page_mkclean_one(page, vma, address); } } - spin_unlock(&mapping->i_mmap_lock); + mutex_unlock(&mapping->i_mmap_mutex); return ret; } @@ -1119,7 +1119,7 @@ out_mlock: /* * We need mmap_sem locking, Otherwise VM_LOCKED check makes * unstable result and race. Plus, We can't wait here because - * we now hold anon_vma->lock or mapping->i_mmap_lock. + * we now hold anon_vma->lock or mapping->i_mmap_mutex. * if trylock failed, the page remain in evictable lru and later * vmscan could retry to move the page to unevictable lru if the * page is actually mlocked. 
@@ -1345,7 +1345,7 @@ static int try_to_unmap_file(struct page *page, enum ttu_flags flags) unsigned long max_nl_size = 0; unsigned int mapcount; - spin_lock(&mapping->i_mmap_lock); + mutex_lock(&mapping->i_mmap_mutex); vma_prio_tree_foreach(vma, &iter, &mapping->i_mmap, pgoff, pgoff) { unsigned long address = vma_address(page, vma); if (address == -EFAULT) @@ -1391,7 +1391,7 @@ static int try_to_unmap_file(struct page *page, enum ttu_flags flags) mapcount = page_mapcount(page); if (!mapcount) goto out; - cond_resched_lock(&mapping->i_mmap_lock); + cond_resched(); max_nl_size = (max_nl_size + CLUSTER_SIZE - 1) & CLUSTER_MASK; if (max_nl_cursor == 0) @@ -1413,7 +1413,7 @@ static int try_to_unmap_file(struct page *page, enum ttu_flags flags) } vma->vm_private_data = (void *) max_nl_cursor; } - cond_resched_lock(&mapping->i_mmap_lock); + cond_resched(); max_nl_cursor += CLUSTER_SIZE; } while (max_nl_cursor <= max_nl_size); @@ -1425,7 +1425,7 @@ static int try_to_unmap_file(struct page *page, enum ttu_flags flags) list_for_each_entry(vma, &mapping->i_mmap_nonlinear, shared.vm_set.list) vma->vm_private_data = NULL; out: - spin_unlock(&mapping->i_mmap_lock); + mutex_unlock(&mapping->i_mmap_mutex); return ret; } @@ -1544,7 +1544,7 @@ static int rmap_walk_file(struct page *page, int (*rmap_one)(struct page *, if (!mapping) return ret; - spin_lock(&mapping->i_mmap_lock); + mutex_lock(&mapping->i_mmap_mutex); vma_prio_tree_foreach(vma, &iter, &mapping->i_mmap, pgoff, pgoff) { unsigned long address = vma_address(page, vma); if (address == -EFAULT) @@ -1558,7 +1558,7 @@ static int rmap_walk_file(struct page *page, int (*rmap_one)(struct page *, * never contain migration ptes. Decide what to do about this * limitation to linear when we need rmap_walk() on nonlinear. */ - spin_unlock(&mapping->i_mmap_lock); + mutex_unlock(&mapping->i_mmap_mutex); return ret; } -- cgit v1.2.3-70-g09d2 From de03c72cfce5b263a674d04348b58475ec50163c Mon Sep 17 00:00:00 2001 From: KOSAKI Motohiro Date: Tue, 24 May 2011 17:12:15 -0700 Subject: mm: convert mm->cpu_vm_cpumask into cpumask_var_t cpumask_t is a very big struct and cpu_vm_mask was placed at the wrong position inside mm_struct, which may reduce the cache hit ratio. This patch makes two changes. 1) Move the cpumask to the end of mm_struct, because usually only the front bits of a cpumask are accessed when the system has cpu-hotplug capability. 2) Convert cpu_vm_mask into cpumask_var_t. This may help reduce the memory footprint if cpumask_size() uses nr_cpumask_bits properly in the future. In addition, this patch renames cpu_vm_mask to cpu_vm_mask_var, which may help detect out-of-tree cpu_vm_mask users. This patch has no functional change. [akpm@linux-foundation.org: build fix] [akpm@linux-foundation.org: coding-style fixes] Signed-off-by: KOSAKI Motohiro Cc: David Howells Cc: Koichi Yasutake Cc: Hugh Dickins Cc: Chris Metcalf Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- Documentation/cachetlb.txt | 2 +- arch/x86/kernel/tboot.c | 1 - include/linux/mm_types.h | 9 ++++++--- include/linux/sched.h | 1 + init/main.c | 2 ++ kernel/fork.c | 37 ++++++++++++++++++++++++++++++++++--- mm/init-mm.c | 1 - 7 files changed, 44 insertions(+), 9 deletions(-) (limited to 'kernel') diff --git a/Documentation/cachetlb.txt b/Documentation/cachetlb.txt index 9164ae3b83b..9b728dc1753 100644 --- a/Documentation/cachetlb.txt +++ b/Documentation/cachetlb.txt @@ -16,7 +16,7 @@ on all processors in the system.
Don't let this scare you into thinking SMP cache/tlb flushing must be so inefficient, this is in fact an area where many optimizations are possible. For example, if it can be proven that a user address space has never executed -on a cpu (see vma->cpu_vm_mask), one need not perform a flush +on a cpu (see mm_cpumask()), one need not perform a flush for this address space on that cpu. First, the TLB flushing interfaces, since they are the simplest. The diff --git a/arch/x86/kernel/tboot.c b/arch/x86/kernel/tboot.c index 998e972f3b1..30ac65df7d4 100644 --- a/arch/x86/kernel/tboot.c +++ b/arch/x86/kernel/tboot.c @@ -110,7 +110,6 @@ static struct mm_struct tboot_mm = { .mmap_sem = __RWSEM_INITIALIZER(init_mm.mmap_sem), .page_table_lock = __SPIN_LOCK_UNLOCKED(init_mm.page_table_lock), .mmlist = LIST_HEAD_INIT(init_mm.mmlist), - .cpu_vm_mask = CPU_MASK_ALL, }; static inline void switch_to_tboot_pt(void) diff --git a/include/linux/mm_types.h b/include/linux/mm_types.h index 201998e5b53..c2f9ea7922f 100644 --- a/include/linux/mm_types.h +++ b/include/linux/mm_types.h @@ -265,8 +265,6 @@ struct mm_struct { struct linux_binfmt *binfmt; - cpumask_t cpu_vm_mask; - /* Architecture-specific MM context */ mm_context_t context; @@ -316,9 +314,14 @@ struct mm_struct { #ifdef CONFIG_TRANSPARENT_HUGEPAGE pgtable_t pmd_huge_pte; /* protected by page_table_lock */ #endif + + cpumask_var_t cpu_vm_mask_var; }; /* Future-safe accessor for struct mm_struct's cpu_vm_mask. */ -#define mm_cpumask(mm) (&(mm)->cpu_vm_mask) +static inline cpumask_t *mm_cpumask(struct mm_struct *mm) +{ + return mm->cpu_vm_mask_var; +} #endif /* _LINUX_MM_TYPES_H */ diff --git a/include/linux/sched.h b/include/linux/sched.h index 44b8faaac7c..f18300eddfc 100644 --- a/include/linux/sched.h +++ b/include/linux/sched.h @@ -2176,6 +2176,7 @@ static inline void mmdrop(struct mm_struct * mm) if (unlikely(atomic_dec_and_test(&mm->mm_count))) __mmdrop(mm); } +extern int mm_init_cpumask(struct mm_struct *mm, struct mm_struct *oldmm); /* mmput gets rid of the mappings and all user-space */ extern void mmput(struct mm_struct *); diff --git a/init/main.c b/init/main.c index 48df882d51d..22da33918ae 100644 --- a/init/main.c +++ b/init/main.c @@ -509,6 +509,8 @@ asmlinkage void __init start_kernel(void) sort_main_extable(); trap_init(); mm_init(); + BUG_ON(mm_init_cpumask(&init_mm, 0)); + /* * Set up the scheduler prior starting any interrupts (such as the * timer interrupt). 
Full topology setup happens at smp_init() diff --git a/kernel/fork.c b/kernel/fork.c index 927692734bc..8e7e135d081 100644 --- a/kernel/fork.c +++ b/kernel/fork.c @@ -485,6 +485,20 @@ static void mm_init_aio(struct mm_struct *mm) #endif } +int mm_init_cpumask(struct mm_struct *mm, struct mm_struct *oldmm) +{ +#ifdef CONFIG_CPUMASK_OFFSTACK + if (!alloc_cpumask_var(&mm->cpu_vm_mask_var, GFP_KERNEL)) + return -ENOMEM; + + if (oldmm) + cpumask_copy(mm_cpumask(mm), mm_cpumask(oldmm)); + else + memset(mm_cpumask(mm), 0, cpumask_size()); +#endif + return 0; +} + static struct mm_struct * mm_init(struct mm_struct * mm, struct task_struct *p) { atomic_set(&mm->mm_users, 1); @@ -521,10 +535,20 @@ struct mm_struct * mm_alloc(void) struct mm_struct * mm; mm = allocate_mm(); - if (mm) { - memset(mm, 0, sizeof(*mm)); - mm = mm_init(mm, current); + if (!mm) + return NULL; + + memset(mm, 0, sizeof(*mm)); + mm = mm_init(mm, current); + if (!mm) + return NULL; + + if (mm_init_cpumask(mm, NULL)) { + mm_free_pgd(mm); + free_mm(mm); + return NULL; } + return mm; } @@ -536,6 +560,7 @@ struct mm_struct * mm_alloc(void) void __mmdrop(struct mm_struct *mm) { BUG_ON(mm == &init_mm); + free_cpumask_var(mm->cpu_vm_mask_var); mm_free_pgd(mm); destroy_context(mm); mmu_notifier_mm_destroy(mm); @@ -690,6 +715,9 @@ struct mm_struct *dup_mm(struct task_struct *tsk) if (!mm_init(mm, tsk)) goto fail_nomem; + if (mm_init_cpumask(mm, oldmm)) + goto fail_nocpumask; + if (init_new_context(tsk, mm)) goto fail_nocontext; @@ -716,6 +744,9 @@ fail_nomem: return NULL; fail_nocontext: + free_cpumask_var(mm->cpu_vm_mask_var); + +fail_nocpumask: /* * If init_new_context() failed, we cannot use mmput() to free the mm * because it calls destroy_context() diff --git a/mm/init-mm.c b/mm/init-mm.c index 1d29cdfe8eb..4019979b263 100644 --- a/mm/init-mm.c +++ b/mm/init-mm.c @@ -21,6 +21,5 @@ struct mm_struct init_mm = { .mmap_sem = __RWSEM_INITIALIZER(init_mm.mmap_sem), .page_table_lock = __SPIN_LOCK_UNLOCKED(init_mm.page_table_lock), .mmlist = LIST_HEAD_INIT(init_mm.mmlist), - .cpu_vm_mask = CPU_MASK_ALL, INIT_MM_CONTEXT(init_mm) }; -- cgit v1.2.3-70-g09d2 From 4b060420a596095869a6d7849caa798d23839cd1 Mon Sep 17 00:00:00 2001 From: Mike Travis Date: Tue, 24 May 2011 17:13:12 -0700 Subject: bitmap, irq: add smp_affinity_list interface to /proc/irq Manually adjusting the smp_affinity for IRQ's becomes unwieldy when the cpu count is large. Setting smp affinity to cpus 256 to 263 would be: echo 000000ff,00000000,00000000,00000000,00000000,00000000,00000000,00000000 > smp_affinity instead of: echo 256-263 > smp_affinity_list Think about what it looks like for cpus around say, 4088 to 4095. We already have many alternate "list" interfaces: /sys/devices/system/cpu/cpuX/indexY/shared_cpu_list /sys/devices/system/cpu/cpuX/topology/thread_siblings_list /sys/devices/system/cpu/cpuX/topology/core_siblings_list /sys/devices/system/node/nodeX/cpulist /sys/devices/pci***/***/local_cpulist Add a companion interface, smp_affinity_list to use cpu lists instead of cpu maps. This conforms to other companion interfaces where both a map and a list interface exists. This required adding a bitmap_parselist_user() function in a manner similar to the bitmap_parse_user() function. 
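As a sketch of the grammar these list interfaces accept (illustrative only, not part of the patch): bitmap_parselist() takes a comma-separated sequence of decimal numbers and inclusive a-b ranges:

    #include <linux/bitmap.h>
    #include <linux/types.h>

    static int parselist_demo(void)
    {
            DECLARE_BITMAP(mask, 16);

            /* "0,3-5" sets bits 0, 3, 4 and 5, so mask[0] == 0x39 */
            return bitmap_parselist("0,3-5", mask, 16);
    }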
[akpm@linux-foundation.org: make __bitmap_parselist() static] Signed-off-by: Mike Travis Cc: Thomas Gleixner Cc: Jack Steiner Cc: Lee Schermerhorn Cc: Andy Shevchenko Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- Documentation/IRQ-affinity.txt | 17 ++++-- Documentation/filesystems/proc.txt | 11 +++- include/linux/bitmap.h | 5 +- include/linux/cpumask.h | 15 +++++ kernel/irq/proc.c | 54 ++++++++++++++++-- lib/bitmap.c | 109 +++++++++++++++++++++++++++++++++---- 6 files changed, 188 insertions(+), 23 deletions(-) (limited to 'kernel') diff --git a/Documentation/IRQ-affinity.txt b/Documentation/IRQ-affinity.txt index b4a615b7840..7890fae1852 100644 --- a/Documentation/IRQ-affinity.txt +++ b/Documentation/IRQ-affinity.txt @@ -4,10 +4,11 @@ ChangeLog: SMP IRQ affinity -/proc/irq/IRQ#/smp_affinity specifies which target CPUs are permitted -for a given IRQ source. It's a bitmask of allowed CPUs. It's not allowed -to turn off all CPUs, and if an IRQ controller does not support IRQ -affinity then the value will not change from the default 0xffffffff. +/proc/irq/IRQ#/smp_affinity and /proc/irq/IRQ#/smp_affinity_list specify +which target CPUs are permitted for a given IRQ source. It's a bitmask +(smp_affinity) or cpu list (smp_affinity_list) of allowed CPUs. It's not +allowed to turn off all CPUs, and if an IRQ controller does not support +IRQ affinity then the value will not change from the default of all cpus. /proc/irq/default_smp_affinity specifies default affinity mask that applies to all non-active IRQs. Once IRQ is allocated/activated its affinity bitmask @@ -54,3 +55,11 @@ round-trip min/avg/max = 0.1/0.5/585.4 ms This time around IRQ44 was delivered only to the last four processors. i.e counters for the CPU0-3 did not change. +Here is an example of limiting that same irq (44) to cpus 1024 to 1031: + +[root@moon 44]# echo 1024-1031 > smp_affinity +[root@moon 44]# cat smp_affinity +1024-1031 + +Note that to do this with a bitmask would require 32 bitmasks of zero +to follow the pertinent one. diff --git a/Documentation/filesystems/proc.txt b/Documentation/filesystems/proc.txt index 60740e8ecb3..f4817802406 100644 --- a/Documentation/filesystems/proc.txt +++ b/Documentation/filesystems/proc.txt @@ -574,6 +574,12 @@ The contents of each smp_affinity file is the same by default: > cat /proc/irq/0/smp_affinity ffffffff +There is an alternate interface, smp_affinity_list which allows specifying +a cpu range instead of a bitmask: + + > cat /proc/irq/0/smp_affinity_list + 1024-1031 + The default_smp_affinity mask applies to all non-active IRQs, which are the IRQs which have not yet been allocated/activated, and hence which lack a /proc/irq/[0-9]* directory. @@ -583,12 +589,13 @@ reports itself as being attached. This hardware locality information does not include information about any possible driver locality preference. prof_cpu_mask specifies which CPUs are to be profiled by the system wide -profiler. Default value is ffffffff (all cpus). +profiler. Default value is ffffffff (all cpus if there are only 32 of them). The way IRQs are routed is handled by the IO-APIC, and it's Round Robin between all the CPUs which are allowed to handle it. As usual the kernel has more info than you and does a better job than you, so the defaults are the -best choice for almost everyone. +best choice for almost everyone. [Note this applies only to those IO-APIC's +that support "Round Robin" interrupt distribution.] There are three more important subdirectories in /proc: net, scsi, and sys. 
The general rule is that the contents, or even the existence of these diff --git a/include/linux/bitmap.h b/include/linux/bitmap.h index daf8c480c78..dcafe0bf000 100644 --- a/include/linux/bitmap.h +++ b/include/linux/bitmap.h @@ -55,7 +55,8 @@ * bitmap_parse(buf, buflen, dst, nbits) Parse bitmap dst from kernel buf * bitmap_parse_user(ubuf, ulen, dst, nbits) Parse bitmap dst from user buf * bitmap_scnlistprintf(buf, len, src, nbits) Print bitmap src as list to buf - * bitmap_parselist(buf, dst, nbits) Parse bitmap dst from list + * bitmap_parselist(buf, dst, nbits) Parse bitmap dst from kernel buf + * bitmap_parselist_user(buf, dst, nbits) Parse bitmap dst from user buf * bitmap_find_free_region(bitmap, bits, order) Find and allocate bit region * bitmap_release_region(bitmap, pos, order) Free specified bit region * bitmap_allocate_region(bitmap, pos, order) Allocate specified bit region @@ -129,6 +130,8 @@ extern int bitmap_scnlistprintf(char *buf, unsigned int len, const unsigned long *src, int nbits); extern int bitmap_parselist(const char *buf, unsigned long *maskp, int nmaskbits); +extern int bitmap_parselist_user(const char __user *ubuf, unsigned int ulen, + unsigned long *dst, int nbits); extern void bitmap_remap(unsigned long *dst, const unsigned long *src, const unsigned long *old, const unsigned long *new, int bits); extern int bitmap_bitremap(int oldbit, diff --git a/include/linux/cpumask.h b/include/linux/cpumask.h index bae6fe24d1f..b24ac56477b 100644 --- a/include/linux/cpumask.h +++ b/include/linux/cpumask.h @@ -546,6 +546,21 @@ static inline int cpumask_parse_user(const char __user *buf, int len, return bitmap_parse_user(buf, len, cpumask_bits(dstp), nr_cpumask_bits); } +/** + * cpumask_parselist_user - extract a cpumask from a user string + * @buf: the buffer to extract from + * @len: the length of the buffer + * @dstp: the cpumask to set. + * + * Returns -errno, or 0 for success. 
+ */ +static inline int cpumask_parselist_user(const char __user *buf, int len, + struct cpumask *dstp) +{ + return bitmap_parselist_user(buf, len, cpumask_bits(dstp), + nr_cpumask_bits); +} + /** * cpulist_scnprintf - print a cpumask into a string as comma-separated list * @buf: the buffer to sprintf into diff --git a/kernel/irq/proc.c b/kernel/irq/proc.c index 834899f2500..64e3df6ab1e 100644 --- a/kernel/irq/proc.c +++ b/kernel/irq/proc.c @@ -19,7 +19,7 @@ static struct proc_dir_entry *root_irq_dir; #ifdef CONFIG_SMP -static int irq_affinity_proc_show(struct seq_file *m, void *v) +static int show_irq_affinity(int type, struct seq_file *m, void *v) { struct irq_desc *desc = irq_to_desc((long)m->private); const struct cpumask *mask = desc->irq_data.affinity; @@ -28,7 +28,10 @@ static int irq_affinity_proc_show(struct seq_file *m, void *v) if (irqd_is_setaffinity_pending(&desc->irq_data)) mask = desc->pending_mask; #endif - seq_cpumask(m, mask); + if (type) + seq_cpumask_list(m, mask); + else + seq_cpumask(m, mask); seq_putc(m, '\n'); return 0; } @@ -59,7 +62,18 @@ static int irq_affinity_hint_proc_show(struct seq_file *m, void *v) #endif int no_irq_affinity; -static ssize_t irq_affinity_proc_write(struct file *file, +static int irq_affinity_proc_show(struct seq_file *m, void *v) +{ + return show_irq_affinity(0, m, v); +} + +static int irq_affinity_list_proc_show(struct seq_file *m, void *v) +{ + return show_irq_affinity(1, m, v); +} + + +static ssize_t write_irq_affinity(int type, struct file *file, const char __user *buffer, size_t count, loff_t *pos) { unsigned int irq = (int)(long)PDE(file->f_path.dentry->d_inode)->data; @@ -72,7 +86,10 @@ static ssize_t irq_affinity_proc_write(struct file *file, if (!alloc_cpumask_var(&new_value, GFP_KERNEL)) return -ENOMEM; - err = cpumask_parse_user(buffer, count, new_value); + if (type) + err = cpumask_parselist_user(buffer, count, new_value); + else + err = cpumask_parse_user(buffer, count, new_value); if (err) goto free_cpumask; @@ -100,11 +117,28 @@ free_cpumask: return err; } +static ssize_t irq_affinity_proc_write(struct file *file, + const char __user *buffer, size_t count, loff_t *pos) +{ + return write_irq_affinity(0, file, buffer, count, pos); +} + +static ssize_t irq_affinity_list_proc_write(struct file *file, + const char __user *buffer, size_t count, loff_t *pos) +{ + return write_irq_affinity(1, file, buffer, count, pos); +} + static int irq_affinity_proc_open(struct inode *inode, struct file *file) { return single_open(file, irq_affinity_proc_show, PDE(inode)->data); } +static int irq_affinity_list_proc_open(struct inode *inode, struct file *file) +{ + return single_open(file, irq_affinity_list_proc_show, PDE(inode)->data); +} + static int irq_affinity_hint_proc_open(struct inode *inode, struct file *file) { return single_open(file, irq_affinity_hint_proc_show, PDE(inode)->data); @@ -125,6 +159,14 @@ static const struct file_operations irq_affinity_hint_proc_fops = { .release = single_release, }; +static const struct file_operations irq_affinity_list_proc_fops = { + .open = irq_affinity_list_proc_open, + .read = seq_read, + .llseek = seq_lseek, + .release = single_release, + .write = irq_affinity_list_proc_write, +}; + static int default_affinity_show(struct seq_file *m, void *v) { seq_cpumask(m, irq_default_affinity); @@ -289,6 +331,10 @@ void register_irq_proc(unsigned int irq, struct irq_desc *desc) proc_create_data("affinity_hint", 0400, desc->dir, &irq_affinity_hint_proc_fops, (void *)(long)irq); + /* create 
/proc/irq//smp_affinity_list */ + proc_create_data("smp_affinity_list", 0600, desc->dir, + &irq_affinity_list_proc_fops, (void *)(long)irq); + proc_create_data("node", 0444, desc->dir, &irq_node_proc_fops, (void *)(long)irq); #endif diff --git a/lib/bitmap.c b/lib/bitmap.c index 91e0ccfdb42..41baf02924e 100644 --- a/lib/bitmap.c +++ b/lib/bitmap.c @@ -571,8 +571,11 @@ int bitmap_scnlistprintf(char *buf, unsigned int buflen, EXPORT_SYMBOL(bitmap_scnlistprintf); /** - * bitmap_parselist - convert list format ASCII string to bitmap + * __bitmap_parselist - convert list format ASCII string to bitmap * @bp: read nul-terminated user string from this buffer + * @buflen: buffer size in bytes. If string is smaller than this + * then it must be terminated with a \0. + * @is_user: location of buffer, 0 indicates kernel space * @maskp: write resulting mask here * @nmaskbits: number of bits in mask to be written * @@ -587,20 +590,63 @@ EXPORT_SYMBOL(bitmap_scnlistprintf); * %-EINVAL: invalid character in string * %-ERANGE: bit number specified too large for mask */ -int bitmap_parselist(const char *bp, unsigned long *maskp, int nmaskbits) +static int __bitmap_parselist(const char *buf, unsigned int buflen, + int is_user, unsigned long *maskp, + int nmaskbits) { unsigned a, b; + int c, old_c, totaldigits; + const char __user *ubuf = buf; + int exp_digit, in_range; + totaldigits = c = 0; bitmap_zero(maskp, nmaskbits); do { - if (!isdigit(*bp)) - return -EINVAL; - b = a = simple_strtoul(bp, (char **)&bp, BASEDEC); - if (*bp == '-') { - bp++; - if (!isdigit(*bp)) + exp_digit = 1; + in_range = 0; + a = b = 0; + + /* Get the next cpu# or a range of cpu#'s */ + while (buflen) { + old_c = c; + if (is_user) { + if (__get_user(c, ubuf++)) + return -EFAULT; + } else + c = *buf++; + buflen--; + if (isspace(c)) + continue; + + /* + * If the last character was a space and the current + * character isn't '\0', we've got embedded whitespace. + * This is a no-no, so throw an error. + */ + if (totaldigits && c && isspace(old_c)) + return -EINVAL; + + /* A '\0' or a ',' signal the end of a cpu# or range */ + if (c == '\0' || c == ',') + break; + + if (c == '-') { + if (exp_digit || in_range) + return -EINVAL; + b = 0; + in_range = 1; + exp_digit = 1; + continue; + } + + if (!isdigit(c)) return -EINVAL; - b = simple_strtoul(bp, (char **)&bp, BASEDEC); + + b = b * 10 + (c - '0'); + if (!in_range) + a = b; + exp_digit = 0; + totaldigits++; } if (!(a <= b)) return -EINVAL; @@ -610,13 +656,52 @@ int bitmap_parselist(const char *bp, unsigned long *maskp, int nmaskbits) set_bit(a, maskp); a++; } - if (*bp == ',') - bp++; - } while (*bp != '\0' && *bp != '\n'); + } while (buflen && c == ','); return 0; } + +int bitmap_parselist(const char *bp, unsigned long *maskp, int nmaskbits) +{ + char *nl = strchr(bp, '\n'); + int len; + + if (nl) + len = nl - bp; + else + len = strlen(bp); + + return __bitmap_parselist(bp, len, 0, maskp, nmaskbits); +} EXPORT_SYMBOL(bitmap_parselist); + +/** + * bitmap_parselist_user() + * + * @ubuf: pointer to user buffer containing string. + * @ulen: buffer size in bytes. If string is smaller than this + * then it must be terminated with a \0. + * @maskp: pointer to bitmap array that will contain result. + * @nmaskbits: size of bitmap, in bits. + * + * Wrapper for bitmap_parselist(), providing it with user buffer. + * + * We cannot have this as an inline function in bitmap.h because it needs + * linux/uaccess.h to get the access_ok() declaration and this causes + * cyclic dependencies. 
+ */ +int bitmap_parselist_user(const char __user *ubuf, + unsigned int ulen, unsigned long *maskp, + int nmaskbits) +{ + if (!access_ok(VERIFY_READ, ubuf, ulen)) + return -EFAULT; + return __bitmap_parselist((const char *)ubuf, + ulen, 1, maskp, nmaskbits); +} +EXPORT_SYMBOL(bitmap_parselist_user); + + /** * bitmap_pos_to_ord - find ordinal of set bit at given position in bitmap * @buf: pointer to a bitmap -- cgit v1.2.3-70-g09d2 From 162a7e7500f9664636e649ba59defe541b7c2c60 Mon Sep 17 00:00:00 2001 From: Mike Travis Date: Tue, 24 May 2011 17:13:20 -0700 Subject: printk: allocate kernel log buffer earlier On larger systems, because of the numerous ACPI, Bootmem and EFI messages, the static log buffer overflows before the larger one specified by the log_buf_len param is allocated. Minimize the overflow by allocating the new log buffer as soon as possible. On kernels without memblock, a later call to setup_log_buf from kernel/init.c is the fallback. [akpm@linux-foundation.org: coding-style fixes] [akpm@linux-foundation.org: fix CONFIG_PRINTK=n build] Signed-off-by: Mike Travis Cc: Yinghai Lu Cc: "H. Peter Anvin" Cc: Jack Steiner Cc: Thomas Gleixner Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- arch/x86/kernel/setup.c | 2 ++ include/linux/printk.h | 7 ++++ init/main.c | 1 + kernel/printk.c | 87 ++++++++++++++++++++++++++++++++----------------- 4 files changed, 68 insertions(+), 29 deletions(-) (limited to 'kernel') diff --git a/arch/x86/kernel/setup.c b/arch/x86/kernel/setup.c index 605e5ae19c7..a3e5948670c 100644 --- a/arch/x86/kernel/setup.c +++ b/arch/x86/kernel/setup.c @@ -946,6 +946,8 @@ void __init setup_arch(char **cmdline_p) if (init_ohci1394_dma_early) init_ohci1394_dma_on_all_controllers(); #endif + /* Allocate bigger log buffer */ + setup_log_buf(1); reserve_initrd(); diff --git a/include/linux/printk.h b/include/linux/printk.h index ee048e77e1a..0101d55d965 100644 --- a/include/linux/printk.h +++ b/include/linux/printk.h @@ -1,6 +1,8 @@ #ifndef __KERNEL_PRINTK__ #define __KERNEL_PRINTK__ +#include + extern const char linux_banner[]; extern const char linux_proc_banner[]; @@ -113,6 +115,7 @@ extern int dmesg_restrict; extern int kptr_restrict; void log_buf_kexec_setup(void); +void __init setup_log_buf(int early); #else static inline __attribute__ ((format (printf, 1, 0))) int vprintk(const char *s, va_list args) @@ -137,6 +140,10 @@ static inline bool printk_timed_ratelimit(unsigned long *caller_jiffies, static inline void log_buf_kexec_setup(void) { } + +static inline void setup_log_buf(int early) +{ +} #endif extern void dump_stack(void) __cold; diff --git a/init/main.c b/init/main.c index 22da33918ae..d2f1e086bf3 100644 --- a/init/main.c +++ b/init/main.c @@ -504,6 +504,7 @@ asmlinkage void __init start_kernel(void) * These use large bootmem allocations and must precede * kmem_cache_init() */ + setup_log_buf(0); pidhash_init(); vfs_caches_init_early(); sort_main_extable(); diff --git a/kernel/printk.c b/kernel/printk.c index da8ca817eae..35185392173 100644 --- a/kernel/printk.c +++ b/kernel/printk.c @@ -31,6 +31,7 @@ #include #include #include +#include #include #include #include @@ -167,46 +168,74 @@ void log_buf_kexec_setup(void) } #endif +/* requested log_buf_len from kernel cmdline */ +static unsigned long __initdata new_log_buf_len; + +/* save requested log_buf_len since it's too early to process it */ static int __init log_buf_len_setup(char *str) { unsigned size = memparse(str, &str); - unsigned long flags; if (size) size = roundup_pow_of_two(size); - if 
(size > log_buf_len) { - unsigned start, dest_idx, offset; - char *new_log_buf; + if (size > log_buf_len) + new_log_buf_len = size; - new_log_buf = alloc_bootmem(size); - if (!new_log_buf) { - printk(KERN_WARNING "log_buf_len: allocation failed\n"); - goto out; - } + return 0; +} +early_param("log_buf_len", log_buf_len_setup); - spin_lock_irqsave(&logbuf_lock, flags); - log_buf_len = size; - log_buf = new_log_buf; - - offset = start = min(con_start, log_start); - dest_idx = 0; - while (start != log_end) { - log_buf[dest_idx] = __log_buf[start & (__LOG_BUF_LEN - 1)]; - start++; - dest_idx++; - } - log_start -= offset; - con_start -= offset; - log_end -= offset; - spin_unlock_irqrestore(&logbuf_lock, flags); +void __init setup_log_buf(int early) +{ + unsigned long flags; + unsigned start, dest_idx, offset; + char *new_log_buf; + int free; + + if (!new_log_buf_len) + return; + + if (early) { + unsigned long mem; - printk(KERN_NOTICE "log_buf_len: %d\n", log_buf_len); + mem = memblock_alloc(new_log_buf_len, PAGE_SIZE); + if (mem == MEMBLOCK_ERROR) + return; + new_log_buf = __va(mem); + } else { + new_log_buf = alloc_bootmem_nopanic(new_log_buf_len); } -out: - return 1; -} -__setup("log_buf_len=", log_buf_len_setup); + if (unlikely(!new_log_buf)) { + pr_err("log_buf_len: %ld bytes not available\n", + new_log_buf_len); + return; + } + + spin_lock_irqsave(&logbuf_lock, flags); + log_buf_len = new_log_buf_len; + log_buf = new_log_buf; + new_log_buf_len = 0; + free = __LOG_BUF_LEN - log_end; + + offset = start = min(con_start, log_start); + dest_idx = 0; + while (start != log_end) { + unsigned log_idx_mask = start & (__LOG_BUF_LEN - 1); + + log_buf[dest_idx] = __log_buf[log_idx_mask]; + start++; + dest_idx++; + } + log_start -= offset; + con_start -= offset; + log_end -= offset; + spin_unlock_irqrestore(&logbuf_lock, flags); + + pr_info("log_buf_len: %d\n", log_buf_len); + pr_info("early log buf free: %d(%d%%)\n", + free, (free * 100) / __LOG_BUF_LEN); +} #ifdef CONFIG_BOOT_PRINTK_DELAY -- cgit v1.2.3-70-g09d2 From 0666fb51b1483f27506e212cc7f7b2645b5c7acc Mon Sep 17 00:00:00 2001 From: Oleg Nesterov Date: Wed, 25 May 2011 19:20:21 +0200 Subject: ptrace: ptrace_resume() shouldn't wake up !TASK_TRACED thread It is not clear why ptrace_resume() does wake_up_process(). Unless the caller is PTRACE_KILL the tracee should be TASK_TRACED so we can use wake_up_state(__TASK_TRACED). If sys_ptrace() races with SIGKILL we do not need the extra and potentially spurious wakeup. If the caller is PTRACE_KILL, wake_up_process() is even more wrong. The tracee can sleep in any state in any place, and if we have buggy code which doesn't handle a spurious wakeup correctly PTRACE_KILL can be used to exploit it. For example:

    int main(void)
    {
            int child, status;

            child = fork();
            if (!child) {
                    int ret;

                    assert(ptrace(PTRACE_TRACEME, 0,0,0) == 0);
                    ret = pause();
                    printf("pause: %d %m\n", ret);
                    return 0x23;
            }

            sleep(1);
            assert(ptrace(PTRACE_KILL, child, 0,0) == 0);

            assert(child == wait(&status));
            printf("wait: %x\n", status);
            return 0;
    }

This prints "pause: -1 Unknown error 514"; -ERESTARTNOHAND leaks to the userland. In this case sys_pause() is buggy as well and should be fixed. I do not know what was the original rationale behind PTRACE_KILL. The man page is simply wrong and afaics it was always wrong. Imho it should be deprecated, or maybe it should do send_sig(SIGKILL) as Denys suggests, but in any case I do not think that the current behaviour was intentional.
Note: there is another problem, ptrace_resume() changes ->exit_code and this can race with SIGKILL too. Eventually we should change ptrace to not use ->exit_code. Signed-off-by: Oleg Nesterov --- kernel/ptrace.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'kernel') diff --git a/kernel/ptrace.c b/kernel/ptrace.c index 7a81fc07134..2df115790cd 100644 --- a/kernel/ptrace.c +++ b/kernel/ptrace.c @@ -562,7 +562,7 @@ static int ptrace_resume(struct task_struct *child, long request, } child->exit_code = data; - wake_up_process(child); + wake_up_state(child, __TASK_TRACED); return 0; } -- cgit v1.2.3-70-g09d2 From d92fcf0552a15891b25c343cee340d295e24109c Mon Sep 17 00:00:00 2001 From: Oleg Nesterov Date: Wed, 25 May 2011 19:22:27 +0200 Subject: signal: sys_pause() should check signal_pending() ERESTART* is always wrong without TIF_SIGPENDING. Teach sys_pause() to handle the spurious wakeup correctly. Signed-off-by: Oleg Nesterov --- kernel/signal.c | 6 ++++-- 1 file changed, 4 insertions(+), 2 deletions(-) (limited to 'kernel') diff --git a/kernel/signal.c b/kernel/signal.c index ad5e818baac..86c32b884f8 100644 --- a/kernel/signal.c +++ b/kernel/signal.c @@ -3023,8 +3023,10 @@ SYSCALL_DEFINE2(signal, int, sig, __sighandler_t, handler) SYSCALL_DEFINE0(pause) { - current->state = TASK_INTERRUPTIBLE; - schedule(); + while (!signal_pending(current)) { + current->state = TASK_INTERRUPTIBLE; + schedule(); + } return -ERESTARTNOHAND; } -- cgit v1.2.3-70-g09d2 From 90ff1f30c0f401e325d6b2747618b7e3a0addaf8 Mon Sep 17 00:00:00 2001 From: Thomas Gleixner Date: Wed, 25 May 2011 23:08:17 +0200 Subject: hrtimers: Fix typo causing erratic timers commit 9ec2690758a5 ("timerfd: Manage cancelable timers in timerfd") introduced a CONFIG_HIGHRES_TIMERS (should be CONFIG_HIGH_RES_TIMERS) typo, which caused applications depending on CLOCK_REALTIME timers to become sluggish because the time base of the realtime timers was not updated when the wall clock time was set. This causes anything from 100% CPU use for some applications to odd delays and hiccups. Reported-bisected-and-tested-by: Anca Emanuel Tested-by: Linus Torvalds Fatfingered-by: Thomas Gleixner Signed-off-by: Thomas Gleixner Signed-off-by: Linus Torvalds --- kernel/hrtimer.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'kernel') diff --git a/kernel/hrtimer.c b/kernel/hrtimer.c index c541ee527ec..a9205e32a05 100644 --- a/kernel/hrtimer.c +++ b/kernel/hrtimer.c @@ -748,7 +748,7 @@ static inline void retrigger_next_event(void *arg) { } */ void clock_was_set(void) { -#ifdef CONFIG_HIGHRES_TIMERS +#ifdef CONFIG_HIGH_RES_TIMERS /* Retrigger the CPU local events everywhere */ on_each_cpu(retrigger_next_event, NULL, 1); #endif -- cgit v1.2.3-70-g09d2 From 7cbc5b8d4a775a43875a09e29c49a2a8195b5b2d Mon Sep 17 00:00:00 2001 From: Jiri Olsa Date: Tue, 10 May 2011 12:43:46 +0200 Subject: jump_label: Check entries limit in __jump_label_update When iterating the jump_label entries array (core or modules), the __jump_label_update function peeks past the last entry. The reason is that the end of the for loop depends on the key value of the processed entry. Thus when going through the last array entry, we will touch the memory beyond the array limit. This bug probably will never be triggered, since most likely the memory beyond the jump_label entries will be accessible and the entry->key will be different from the expected value.
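A simplified sketch of that off-by-one (types reduced for illustration; this is not the patch itself). Without a stop pointer the loop's only terminator is the key comparison, so it always reads one entry past the last matching one, which is out of bounds at the end of the array; bounding the loop checks the limit first:

    /* stand-in for struct jump_entry */
    struct demo_entry {
            unsigned long key;
    };

    static void demo_update(struct demo_entry *entry,
                            struct demo_entry *stop, unsigned long key)
    {
            /* the bounds test runs first, so entry == stop is never read */
            for (; entry < stop && entry->key == key; entry++) {
                    /* ... patch the jump site ... */
            }
    }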
Signed-off-by: Jiri Olsa Acked-by: Jason Baron Link: http://lkml.kernel.org/r/20110510104346.GC1899@jolsa.brq.redhat.com Signed-off-by: Steven Rostedt --- kernel/jump_label.c | 18 +++++++++++++----- 1 file changed, 13 insertions(+), 5 deletions(-) (limited to 'kernel') diff --git a/kernel/jump_label.c b/kernel/jump_label.c index 74d1c099fbd..fa27e750dbc 100644 --- a/kernel/jump_label.c +++ b/kernel/jump_label.c @@ -105,9 +105,12 @@ static int __jump_label_text_reserved(struct jump_entry *iter_start, } static void __jump_label_update(struct jump_label_key *key, - struct jump_entry *entry, int enable) + struct jump_entry *entry, + struct jump_entry *stop, int enable) { - for (; entry->key == (jump_label_t)(unsigned long)key; entry++) { + for (; (entry < stop) && + (entry->key == (jump_label_t)(unsigned long)key); + entry++) { /* * entry->code set to 0 invalidates module init text sections * kernel_text_address() verifies we are not in core kernel @@ -181,7 +184,11 @@ static void __jump_label_mod_update(struct jump_label_key *key, int enable) struct jump_label_mod *mod = key->next; while (mod) { - __jump_label_update(key, mod->entries, enable); + struct module *m = mod->mod; + + __jump_label_update(key, mod->entries, + m->jump_entries + m->num_jump_entries, + enable); mod = mod->next; } } @@ -245,7 +252,8 @@ static int jump_label_add_module(struct module *mod) key->next = jlm; if (jump_label_enabled(key)) - __jump_label_update(key, iter, JUMP_LABEL_ENABLE); + __jump_label_update(key, iter, iter_stop, + JUMP_LABEL_ENABLE); } return 0; @@ -371,7 +379,7 @@ static void jump_label_update(struct jump_label_key *key, int enable) /* if there are no users, entry can be NULL */ if (entry) - __jump_label_update(key, entry, enable); + __jump_label_update(key, entry, __stop___jump_table, enable); #ifdef CONFIG_MODULES __jump_label_mod_update(key, enable); -- cgit v1.2.3-70-g09d2 From a1cd6173596c6f7d1f0b41ac7d33ecf03c581edc Mon Sep 17 00:00:00 2001 From: Steven Rostedt Date: Mon, 23 May 2011 15:24:25 -0400 Subject: ftrace: Have ftrace_startup() return failure code The register_ftrace_function() returns an error code on failure except if the call to ftrace_startup() fails. Add an error return to ftrace_startup() if it fails to start, allowing register_ftrace_function() to return a proper error value.
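A hedged caller-side sketch (the registration scaffolding is invented for illustration) of what this enables: with the result propagated, a register_ftrace_function() caller can now observe that tracing never started:

    #include <linux/ftrace.h>
    #include <linux/kernel.h>

    static void demo_callback(unsigned long ip, unsigned long parent_ip)
    {
            /* invoked on entry to every traced function */
    }

    static struct ftrace_ops demo_ops = {
            .func = demo_callback,
    };

    static int demo_register(void)
    {
            int ret = register_ftrace_function(&demo_ops);

            if (ret)        /* e.g. -ENODEV when ftrace is disabled */
                    pr_warn("ftrace callback not registered: %d\n", ret);
            return ret;
    }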
Signed-off-by: Steven Rostedt --- kernel/trace/ftrace.c | 14 ++++++++------ 1 file changed, 8 insertions(+), 6 deletions(-) (limited to 'kernel') diff --git a/kernel/trace/ftrace.c b/kernel/trace/ftrace.c index d017c2c82c4..bebbc959ee8 100644 --- a/kernel/trace/ftrace.c +++ b/kernel/trace/ftrace.c @@ -1638,12 +1638,12 @@ static void ftrace_startup_enable(int command) ftrace_run_update_code(command); } -static void ftrace_startup(struct ftrace_ops *ops, int command) +static int ftrace_startup(struct ftrace_ops *ops, int command) { bool hash_enable = true; if (unlikely(ftrace_disabled)) - return; + return -ENODEV; ftrace_start_up++; command |= FTRACE_ENABLE_CALLS; @@ -1662,6 +1662,8 @@ static void ftrace_startup(struct ftrace_ops *ops, int command) ftrace_hash_rec_enable(ops, 1); ftrace_startup_enable(command); + + return 0; } static void ftrace_shutdown(struct ftrace_ops *ops, int command) @@ -2501,7 +2503,7 @@ static void __enable_ftrace_function_probe(void) ret = __register_ftrace_function(&trace_probe_ops); if (!ret) - ftrace_startup(&trace_probe_ops, 0); + ret = ftrace_startup(&trace_probe_ops, 0); ftrace_probe_registered = 1; } @@ -3466,7 +3468,7 @@ device_initcall(ftrace_nodyn_init); static inline int ftrace_init_dyn_debugfs(struct dentry *d_tracer) { return 0; } static inline void ftrace_startup_enable(int command) { } /* Keep as macros so we do not need to define the commands */ -# define ftrace_startup(ops, command) do { } while (0) +# define ftrace_startup(ops, command) ({0;}) # define ftrace_shutdown(ops, command) do { } while (0) # define ftrace_startup_sysctl() do { } while (0) # define ftrace_shutdown_sysctl() do { } while (0) @@ -3799,7 +3801,7 @@ int register_ftrace_function(struct ftrace_ops *ops) ret = __register_ftrace_function(ops); if (!ret) - ftrace_startup(ops, 0); + ret = ftrace_startup(ops, 0); out_unlock: @@ -4045,7 +4047,7 @@ int register_ftrace_graph(trace_func_graph_ret_t retfunc, ftrace_graph_return = retfunc; ftrace_graph_entry = entryfunc; - ftrace_startup(&global_ops, FTRACE_START_FUNC_RET); + ret = ftrace_startup(&global_ops, FTRACE_START_FUNC_RET); out: mutex_unlock(&ftrace_lock); -- cgit v1.2.3-70-g09d2 From 17bb615ad4f8d2d2c0f02794d27d7f83e0009ef4 Mon Sep 17 00:00:00 2001 From: Steven Rostedt Date: Mon, 23 May 2011 15:27:46 -0400 Subject: tracing: Have event with function tracer check error return The self tests for the event tracer do not check whether function tracing was successfully activated. They need to do so before continuing the tests; otherwise the wrong errors may be reported.
Signed-off-by: Steven Rostedt --- kernel/trace/trace_events.c | 7 ++++++- 1 file changed, 6 insertions(+), 1 deletion(-) (limited to 'kernel') diff --git a/kernel/trace/trace_events.c b/kernel/trace/trace_events.c index 2fe11034135..686ec399f2a 100644 --- a/kernel/trace/trace_events.c +++ b/kernel/trace/trace_events.c @@ -1657,7 +1657,12 @@ static struct ftrace_ops trace_ops __initdata = static __init void event_trace_self_test_with_function(void) { - register_ftrace_function(&trace_ops); + int ret; + ret = register_ftrace_function(&trace_ops); + if (WARN_ON(ret < 0)) { + pr_info("Failed to enable function tracer for event tests\n"); + return; + } pr_info("Running tests again, along with the function tracer\n"); event_trace_self_tests(); unregister_ftrace_function(&trace_ops); -- cgit v1.2.3-70-g09d2 From 3b6cfdb1714a33ae4d2ca9fbc818a42cf7adee69 Mon Sep 17 00:00:00 2001 From: Steven Rostedt Date: Mon, 23 May 2011 15:33:49 -0400 Subject: ftrace: Set ops->flag to enabled even on static function tracing When dynamic ftrace is not configured, the ops->flags still needs to have its FTRACE_OPS_FL_ENABLED bit set in ftrace_startup(). Signed-off-by: Steven Rostedt --- kernel/trace/ftrace.c | 6 +++++- 1 file changed, 5 insertions(+), 1 deletion(-) (limited to 'kernel') diff --git a/kernel/trace/ftrace.c b/kernel/trace/ftrace.c index bebbc959ee8..25949b33057 100644 --- a/kernel/trace/ftrace.c +++ b/kernel/trace/ftrace.c @@ -3468,7 +3468,11 @@ device_initcall(ftrace_nodyn_init); static inline int ftrace_init_dyn_debugfs(struct dentry *d_tracer) { return 0; } static inline void ftrace_startup_enable(int command) { } /* Keep as macros so we do not need to define the commands */ -# define ftrace_startup(ops, command) ({0;}) +# define ftrace_startup(ops, command) \ + ({ \ + (ops)->flags |= FTRACE_OPS_FL_ENABLED; \ + 0; \ + }) # define ftrace_shutdown(ops, command) do { } while (0) # define ftrace_startup_sysctl() do { } while (0) # define ftrace_shutdown_sysctl() do { } while (0) -- cgit v1.2.3-70-g09d2 From 2fc1b6f0d0a719e1e2a30bf076a3a799feaf6af2 Mon Sep 17 00:00:00 2001 From: liubo Date: Tue, 19 Apr 2011 09:35:28 +0800 Subject: tracing: Add __print_symbolic_u64 to avoid warnings on 32bit machine Filesystem, like Btrfs, has some "ULL" macros, and when these macros are passed to tracepoints'__print_symbolic(), there will be 64->32 truncate WARNINGS during compiling on 32bit box. 
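An illustrative fragment of the intended use (the event field and flag names are invented here; the helper itself is defined in the diff below): a tracepoint whose field holds 64-bit ULL flag values can print them symbolically without truncation on 32-bit builds:

    /* inside a hypothetical TRACE_EVENT() definition */
    TP_printk("flags = %s",
              __print_symbolic_u64(__entry->flags,
                    { 1ULL << 32, "FLAG_HI_A" },    /* invented names */
                    { 1ULL << 33, "FLAG_HI_B" }))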
Signed-off-by: Liu Bo Link: http://lkml.kernel.org/r/4DACE6E0.7000507@cn.fujitsu.com Signed-off-by: Steven Rostedt --- include/linux/ftrace_event.h | 12 ++++++++++++ include/trace/ftrace.h | 13 +++++++++++++ kernel/trace/trace_output.c | 27 +++++++++++++++++++++++++++ 3 files changed, 52 insertions(+) (limited to 'kernel') diff --git a/include/linux/ftrace_event.h b/include/linux/ftrace_event.h index b5a550a39a7..59d3ef100eb 100644 --- a/include/linux/ftrace_event.h +++ b/include/linux/ftrace_event.h @@ -16,6 +16,11 @@ struct trace_print_flags { const char *name; }; +struct trace_print_flags_u64 { + unsigned long long mask; + const char *name; +}; + const char *ftrace_print_flags_seq(struct trace_seq *p, const char *delim, unsigned long flags, const struct trace_print_flags *flag_array); @@ -23,6 +28,13 @@ const char *ftrace_print_flags_seq(struct trace_seq *p, const char *delim, const char *ftrace_print_symbols_seq(struct trace_seq *p, unsigned long val, const struct trace_print_flags *symbol_array); +#if BITS_PER_LONG == 32 +const char *ftrace_print_symbols_seq_u64(struct trace_seq *p, + unsigned long long val, + const struct trace_print_flags_u64 + *symbol_array); +#endif + const char *ftrace_print_hex_seq(struct trace_seq *p, const unsigned char *buf, int len); diff --git a/include/trace/ftrace.h b/include/trace/ftrace.h index 3e68366d485..533c49f4804 100644 --- a/include/trace/ftrace.h +++ b/include/trace/ftrace.h @@ -205,6 +205,19 @@ ftrace_print_symbols_seq(p, value, symbols); \ }) +#undef __print_symbolic_u64 +#if BITS_PER_LONG == 32 +#define __print_symbolic_u64(value, symbol_array...) \ + ({ \ + static const struct trace_print_flags_u64 symbols[] = \ + { symbol_array, { -1, NULL } }; \ + ftrace_print_symbols_seq_u64(p, value, symbols); \ + }) +#else +#define __print_symbolic_u64(value, symbol_array...) \ + __print_symbolic(value, symbol_array) +#endif + #undef __print_hex #define __print_hex(buf, buf_len) ftrace_print_hex_seq(p, buf, buf_len) diff --git a/kernel/trace/trace_output.c b/kernel/trace/trace_output.c index cf535ccedc8..e37de492a9e 100644 --- a/kernel/trace/trace_output.c +++ b/kernel/trace/trace_output.c @@ -353,6 +353,33 @@ ftrace_print_symbols_seq(struct trace_seq *p, unsigned long val, } EXPORT_SYMBOL(ftrace_print_symbols_seq); +#if BITS_PER_LONG == 32 +const char * +ftrace_print_symbols_seq_u64(struct trace_seq *p, unsigned long long val, + const struct trace_print_flags_u64 *symbol_array) +{ + int i; + const char *ret = p->buffer + p->len; + + for (i = 0; symbol_array[i].name; i++) { + + if (val != symbol_array[i].mask) + continue; + + trace_seq_puts(p, symbol_array[i].name); + break; + } + + if (!p->len) + trace_seq_printf(p, "0x%llx", val); + + trace_seq_putc(p, 0); + + return ret; +} +EXPORT_SYMBOL(ftrace_print_symbols_seq_u64); +#endif + const char * ftrace_print_hex_seq(struct trace_seq *p, const unsigned char *buf, int buf_len) { -- cgit v1.2.3-70-g09d2 From b1cff0ad1062621ae63cb6c5dc4165191fe2e9f1 Mon Sep 17 00:00:00 2001 From: Steven Rostedt Date: Wed, 25 May 2011 14:27:43 -0400 Subject: ftrace: Add internal recursive checks Witold reported a reboot caused by the selftests of the dynamic function tracer. He sent me a config and I used ktest to do a config_bisect on it (as my config did not cause the crash). It pointed out that the problem config was CONFIG_PROVE_RCU. What happened was that if multiple callbacks are attached to the function tracer, we iterate a list of callbacks. 
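The walk in question has roughly this shape (schematic only; the real code appears in the diff further below):

/* Schematic of the ftrace callback-list walk: each op->func() may
 * itself execute traced code, which is what makes unguarded
 * re-entry into this loop possible. */
op = rcu_dereference_raw(ftrace_global_list);
while (op != &ftrace_list_end) {
	op->func(ip, parent_ip);
	op = rcu_dereference_raw(op->next);
}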
Because the list is managed by synchronize_sched() and preempt_disable, the access to the pointers uses rcu_dereference_raw(). When PROVE_RCU is enabled, the rcu_dereference_raw() calls some debugging functions, which happen to be traced. The tracing of the debug function would then call rcu_dereference_raw() which would then call the debug function and then... well you get the idea. I first wrote two different patches to solve this bug. 1) add a __rcu_dereference_raw() that would not do any checks. 2) add notrace to the offending debug functions. Both of these patches worked. Talking with Paul McKenney on IRC, he suggested adding recursion detection instead. This seemed to be a better solution, so I decided to implement it. As the task_struct already has a trace_recursion field to detect recursion in the ring buffer, and that field allows only a small count, I decided to use that same variable to add flags that can detect recursion inside the infrastructure of the function tracer. I plan to change it so that the task struct bit can be checked in mcount, but as that requires changes to all archs, I will hold that off until the next merge window. Cc: Ingo Molnar Cc: Peter Zijlstra Cc: Thomas Gleixner Cc: Frederic Weisbecker Cc: Paul E. McKenney Link: http://lkml.kernel.org/r/1306348063.1465.116.camel@gandalf.stny.rr.com Reported-by: Witold Baryluk Signed-off-by: Steven Rostedt --- include/linux/sched.h | 2 +- kernel/trace/ftrace.c | 13 ++++++++++++- kernel/trace/ring_buffer.c | 10 +++++----- kernel/trace/trace.h | 15 +++++++++++++++ 4 files changed, 33 insertions(+), 7 deletions(-) (limited to 'kernel') diff --git a/include/linux/sched.h b/include/linux/sched.h index d8b2d0bec0d..7b78d9cad47 100644 --- a/include/linux/sched.h +++ b/include/linux/sched.h @@ -1513,7 +1513,7 @@ struct task_struct { #ifdef CONFIG_TRACING /* state flags for use by tracers */ unsigned long trace; - /* bitmask of trace recursion */ + /* bitmask and counter of trace recursion */ unsigned long trace_recursion; #endif /* CONFIG_TRACING */ #ifdef CONFIG_CGROUP_MEM_RES_CTLR /* memcg uses this to do batch job */ diff --git a/kernel/trace/ftrace.c b/kernel/trace/ftrace.c index 25949b33057..1ee417fcbfa 100644 --- a/kernel/trace/ftrace.c +++ b/kernel/trace/ftrace.c @@ -109,12 +109,18 @@ ftrace_ops_list_func(unsigned long ip, unsigned long parent_ip); static void ftrace_global_list_func(unsigned long ip, unsigned long parent_ip) { - struct ftrace_ops *op = rcu_dereference_raw(ftrace_global_list); /*see above*/ + struct ftrace_ops *op; + + if (unlikely(trace_recursion_test(TRACE_GLOBAL_BIT))) + return; + trace_recursion_set(TRACE_GLOBAL_BIT); + op = rcu_dereference_raw(ftrace_global_list); /*see above*/ while (op != &ftrace_list_end) { op->func(ip, parent_ip); op = rcu_dereference_raw(op->next); /*see above*/ }; + trace_recursion_clear(TRACE_GLOBAL_BIT); } static void ftrace_pid_func(unsigned long ip, unsigned long parent_ip) @@ -3490,6 +3496,10 @@ ftrace_ops_list_func(unsigned long ip, unsigned long parent_ip) { struct ftrace_ops *op; + if (unlikely(trace_recursion_test(TRACE_INTERNAL_BIT))) + return; + + trace_recursion_set(TRACE_INTERNAL_BIT); /* * Some of the ops may be dynamically allocated, * they must be freed after a synchronize_sched().
@@ -3502,6 +3512,7 @@ ftrace_ops_list_func(unsigned long ip, unsigned long parent_ip) op = rcu_dereference_raw(op->next); }; preempt_enable_notrace(); + trace_recursion_clear(TRACE_INTERNAL_BIT); } static void clear_ftrace_swapper(void) diff --git a/kernel/trace/ring_buffer.c b/kernel/trace/ring_buffer.c index 0ef7b4b2a1f..b0c7aa40794 100644 --- a/kernel/trace/ring_buffer.c +++ b/kernel/trace/ring_buffer.c @@ -2216,7 +2216,7 @@ static noinline void trace_recursive_fail(void) printk_once(KERN_WARNING "Tracing recursion: depth[%ld]:" "HC[%lu]:SC[%lu]:NMI[%lu]\n", - current->trace_recursion, + trace_recursion_buffer(), hardirq_count() >> HARDIRQ_SHIFT, softirq_count() >> SOFTIRQ_SHIFT, in_nmi()); @@ -2226,9 +2226,9 @@ static noinline void trace_recursive_fail(void) static inline int trace_recursive_lock(void) { - current->trace_recursion++; + trace_recursion_inc(); - if (likely(current->trace_recursion < TRACE_RECURSIVE_DEPTH)) + if (likely(trace_recursion_buffer() < TRACE_RECURSIVE_DEPTH)) return 0; trace_recursive_fail(); @@ -2238,9 +2238,9 @@ static inline int trace_recursive_lock(void) static inline void trace_recursive_unlock(void) { - WARN_ON_ONCE(!current->trace_recursion); + WARN_ON_ONCE(!trace_recursion_buffer()); - current->trace_recursion--; + trace_recursion_dec(); } #else diff --git a/kernel/trace/trace.h b/kernel/trace/trace.h index 6b69c4bd306..229f8591f61 100644 --- a/kernel/trace/trace.h +++ b/kernel/trace/trace.h @@ -784,4 +784,19 @@ extern const char *__stop___trace_bprintk_fmt[]; FTRACE_ENTRY(call, struct_name, id, PARAMS(tstruct), PARAMS(print)) #include "trace_entries.h" +/* Only current can touch trace_recursion */ +#define trace_recursion_inc() do { (current)->trace_recursion++; } while (0) +#define trace_recursion_dec() do { (current)->trace_recursion--; } while (0) + +/* Ring buffer has the 10 LSB bits to count */ +#define trace_recursion_buffer() ((current)->trace_recursion & 0x3ff) + +/* for function tracing recursion */ +#define TRACE_INTERNAL_BIT (1<<11) +#define TRACE_GLOBAL_BIT (1<<12) + +#define trace_recursion_set(bit) do { (current)->trace_recursion |= (bit); } while (0) +#define trace_recursion_clear(bit) do { (current)->trace_recursion &= ~(bit); } while (0) +#define trace_recursion_test(bit) ((current)->trace_recursion & (bit)) + #endif /* _LINUX_KERNEL_TRACE_H */ -- cgit v1.2.3-70-g09d2 From def945eeb920b94e710574454043f080831aefe5 Mon Sep 17 00:00:00 2001 From: Yinghai Lu Date: Wed, 25 May 2011 22:09:40 -0700 Subject: irq: Remove smp_affinity_list when unregister irq proc commit 4b06042(bitmap, irq: add smp_affinity_list interface to /proc/irq) causes the following warning: [ 274.239500] WARNING: at fs/proc/generic.c:850 remove_proc_entry+0x24c/0x27a() [ 274.251761] remove_proc_entry: removing non-empty directory 'irq/184', leaking at least 'smp_affinity_list' Remove the new file in the exit path. 
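The rule the fix restores is that register_irq_proc() and unregister_irq_proc() must create and remove the same set of files, innermost entries before the directory itself; an abridged sketch of the unregister side as patched below:

/* Abridged sketch: every per-irq proc file must be removed before
 * remove_proc_entry() runs on the directory itself, or the "removing
 * non-empty directory" warning quoted above fires. */
remove_proc_entry("smp_affinity", desc->dir);
remove_proc_entry("affinity_hint", desc->dir);
remove_proc_entry("smp_affinity_list", desc->dir);	/* the missing removal */
remove_proc_entry("node", desc->dir);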
Signed-off-by: Yinghai Lu Cc: Mike Travis Link: http://lkml.kernel.org/r/4DDDE094.6050505@kernel.org Signed-off-by: Thomas Gleixner --- kernel/irq/proc.c | 1 + 1 file changed, 1 insertion(+) (limited to 'kernel') diff --git a/kernel/irq/proc.c b/kernel/irq/proc.c index 64e3df6ab1e..4bd4faa6323 100644 --- a/kernel/irq/proc.c +++ b/kernel/irq/proc.c @@ -352,6 +352,7 @@ void unregister_irq_proc(unsigned int irq, struct irq_desc *desc) #ifdef CONFIG_SMP remove_proc_entry("smp_affinity", desc->dir); remove_proc_entry("affinity_hint", desc->dir); + remove_proc_entry("smp_affinity_list", desc->dir); remove_proc_entry("node", desc->dir); #endif remove_proc_entry("spurious", desc->dir); -- cgit v1.2.3-70-g09d2 From 0bbcc529fcea9c7de5e2e7243f9913b8f7302a8c Mon Sep 17 00:00:00 2001 From: "Paul E. McKenney" Date: Mon, 16 May 2011 02:24:04 -0700 Subject: rcu: Add memory barriers Add the memory barriers added by e59fb3120b. Signed-off-by: Paul E. McKenney --- kernel/rcutree.c | 6 ++++++ 1 file changed, 6 insertions(+) (limited to 'kernel') diff --git a/kernel/rcutree.c b/kernel/rcutree.c index e486f7c3ffb..3731141d8ad 100644 --- a/kernel/rcutree.c +++ b/kernel/rcutree.c @@ -907,6 +907,12 @@ static void rcu_report_qs_rsp(struct rcu_state *rsp, unsigned long flags) unsigned long gp_duration; WARN_ON_ONCE(!rcu_gp_in_progress(rsp)); + + /* + * Ensure that all grace-period and pre-grace-period activity + * is seen before the assignment to rsp->completed. + */ + smp_mb(); /* See above block comment. */ gp_duration = jiffies - rsp->gp_start; if (gp_duration > rsp->gp_max) rsp->gp_max = gp_duration; -- cgit v1.2.3-70-g09d2 From 1135633bddcf7a819a1490c18d04965c490bcc1e Mon Sep 17 00:00:00 2001 From: "Paul E. McKenney" Date: Mon, 16 May 2011 02:44:06 -0700 Subject: rcu: Remove old memory barriers from rcu_process_callbacks() Second step of partitioning of commit e59fb3120b. Signed-off-by: Paul E. McKenney --- kernel/rcutree.c | 14 -------------- 1 file changed, 14 deletions(-) (limited to 'kernel') diff --git a/kernel/rcutree.c b/kernel/rcutree.c index 3731141d8ad..011bf6f261a 100644 --- a/kernel/rcutree.c +++ b/kernel/rcutree.c @@ -1460,25 +1460,11 @@ __rcu_process_callbacks(struct rcu_state *rsp, struct rcu_data *rdp) */ static void rcu_process_callbacks(void) { - /* - * Memory references from any prior RCU read-side critical sections - * executed by the interrupted code must be seen before any RCU - * grace-period manipulations below. - */ - smp_mb(); /* See above block comment. */ - __rcu_process_callbacks(&rcu_sched_state, &__get_cpu_var(rcu_sched_data)); __rcu_process_callbacks(&rcu_bh_state, &__get_cpu_var(rcu_bh_data)); rcu_preempt_process_callbacks(); - /* - * Memory references from any later RCU read-side critical sections - * executed by the interrupted code must be seen after any RCU - * grace-period manipulations above. - */ - smp_mb(); /* See above block comment. */ - /* If we are last CPU on way to dyntick-idle mode, accelerate it. */ rcu_needs_cpu_flush(); } -- cgit v1.2.3-70-g09d2 From b5904090c754327ed6c2ecaefed4f7d473df393f Mon Sep 17 00:00:00 2001 From: "Paul E. McKenney" Date: Mon, 16 May 2011 02:52:04 -0700 Subject: rcu: Don't do reschedule unless in irq Condition the set_need_resched() in rcu_irq_exit() on in_irq(). This should be a no-op, because rcu_irq_exit() should only be called from irq. Signed-off-by: Paul E. 
McKenney --- kernel/rcutree.c | 5 +++-- 1 file changed, 3 insertions(+), 2 deletions(-) (limited to 'kernel') diff --git a/kernel/rcutree.c b/kernel/rcutree.c index 011bf6f261a..195b3a3313e 100644 --- a/kernel/rcutree.c +++ b/kernel/rcutree.c @@ -421,8 +421,9 @@ void rcu_irq_exit(void) WARN_ON_ONCE(rdtp->dynticks & 0x1); /* If the interrupt queued a callback, get out of dyntick mode. */ - if (__this_cpu_read(rcu_sched_data.nxtlist) || - __this_cpu_read(rcu_bh_data.nxtlist)) + if (in_irq() && + (__this_cpu_read(rcu_sched_data.nxtlist) || + __this_cpu_read(rcu_bh_data.nxtlist))) set_need_resched(); } -- cgit v1.2.3-70-g09d2 From 4305ce7894dd38b0633bfc8978437320119223bd Mon Sep 17 00:00:00 2001 From: "Paul E. McKenney" Date: Mon, 16 May 2011 14:27:31 -0700 Subject: rcu: Make rcu_enter_nohz() pay attention to nesting The old version of rcu_enter_nohz() forced RCU into nohz mode even if the nesting count was non-zero. This change causes rcu_enter_nohz() to hold off for non-zero nesting counts. Signed-off-by: Paul E. McKenney --- kernel/rcutree.c | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) (limited to 'kernel') diff --git a/kernel/rcutree.c b/kernel/rcutree.c index 195b3a3313e..99c6038ad04 100644 --- a/kernel/rcutree.c +++ b/kernel/rcutree.c @@ -324,8 +324,8 @@ void rcu_enter_nohz(void) smp_mb(); /* CPUs seeing ++ must see prior RCU read-side crit sects */ local_irq_save(flags); rdtp = &__get_cpu_var(rcu_dynticks); - rdtp->dynticks++; - rdtp->dynticks_nesting--; + if (--rdtp->dynticks_nesting == 0) + rdtp->dynticks++; WARN_ON_ONCE(rdtp->dynticks & 0x1); local_irq_restore(flags); } -- cgit v1.2.3-70-g09d2 From 23b5c8fa01b723c70a20d6e4ef4ff54c7656d6e1 Mon Sep 17 00:00:00 2001 From: "Paul E. McKenney" Date: Tue, 7 Sep 2010 10:38:22 -0700 Subject: rcu: Decrease memory-barrier usage based on semi-formal proof (Note: this was reverted, and is now being re-applied in pieces, with this being the fifth and final piece. See below for the reason that it is now felt to be safe to re-apply this.) Commit d09b62d fixed grace-period synchronization, but left some smp_mb() invocations in rcu_process_callbacks() that are no longer needed, but sheer paranoia prevented them from being removed. This commit removes them and provides a proof of correctness in their absence. It also adds a memory barrier to rcu_report_qs_rsp() immediately before the update to rsp->completed in order to handle the theoretical possibility that the compiler or CPU might move massive quantities of code into a lock-based critical section. This also proves that the sheer paranoia was not entirely unjustified, at least from a theoretical point of view. In addition, the old dyntick-idle synchronization depended on the fact that grace periods were many milliseconds in duration, so that it could be assumed that no dyntick-idle CPU could reorder a memory reference across an entire grace period. Unfortunately for this design, the addition of expedited grace periods breaks this assumption, which has the unfortunate side-effect of requiring atomic operations in the functions that track dyntick-idle state for RCU. (There is some hope that the algorithms used in user-level RCU might be applied here, but some work is required to handle the NMIs that user-space applications can happily ignore. For the short term, better safe than sorry.) This proof assumes that neither compiler nor CPU will allow a lock acquisition and release to be reordered, as doing so can result in deadlock. The proof is as follows: 1. 
A given CPU declares a quiescent state under the protection of its leaf rcu_node's lock. 2. If there is more than one level of rcu_node hierarchy, the last CPU to declare a quiescent state will also acquire the ->lock of the next rcu_node up in the hierarchy, but only after releasing the lower level's lock. The acquisition of this lock clearly cannot occur prior to the acquisition of the leaf node's lock. 3. Step 2 repeats until we reach the root rcu_node structure. Please note again that only one lock is held at a time through this process. The acquisition of the root rcu_node's ->lock must occur after the release of that of the leaf rcu_node. 4. At this point, we set the ->completed field in the rcu_state structure in rcu_report_qs_rsp(). However, if the rcu_node hierarchy contains only one rcu_node, then in theory the code preceding the quiescent state could leak into the critical section. We therefore precede the update of ->completed with a memory barrier. All CPUs will therefore agree that any updates preceding any report of a quiescent state will have happened before the update of ->completed. 5. Regardless of whether a new grace period is needed, rcu_start_gp() will propagate the new value of ->completed to all of the leaf rcu_node structures, under the protection of each rcu_node's ->lock. If a new grace period is needed immediately, this propagation will occur in the same critical section that ->completed was set in, but courtesy of the memory barrier in #4 above, is still seen to follow any pre-quiescent-state activity. 6. When a given CPU invokes __rcu_process_gp_end(), it becomes aware of the end of the old grace period and therefore makes any RCU callbacks that were waiting on that grace period eligible for invocation. If this CPU is the same one that detected the end of the grace period, and if there is but a single rcu_node in the hierarchy, we will still be in the single critical section. In this case, the memory barrier in step #4 guarantees that all callbacks will be seen to execute after each CPU's quiescent state. On the other hand, if this is a different CPU, it will acquire the leaf rcu_node's ->lock, and will again be serialized after each CPU's quiescent state for the old grace period. On the strength of this proof, this commit therefore removes the memory barriers from rcu_process_callbacks() and adds one to rcu_report_qs_rsp(). The effect is to reduce the number of memory barriers by one and to reduce the frequency of execution from about once per scheduling tick per CPU to once per grace period. This was reverted due to hangs found during testing by Yinghai Lu and Ingo Molnar. Frederic Weisbecker supplied Yinghai with tracing that located the underlying problem, and Frederic also provided the fix. The underlying problem was that the HARDIRQ_ENTER() macro from lib/locking-selftest.c invoked irq_enter(), which in turn invokes rcu_irq_enter(), but HARDIRQ_EXIT() invoked __irq_exit(), which does not invoke rcu_irq_exit(). This situation resulted in calls to rcu_irq_enter() that were not balanced by the required calls to rcu_irq_exit(). Therefore, after these locking selftests completed, RCU's dyntick-idle nesting count was a large number (for example, 72), which caused RCU to conclude that the affected CPU was not in dyntick-idle mode when in fact it was. RCU would therefore incorrectly wait for this dyntick-idle CPU, resulting in hangs.
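The arithmetic of the failure fits in a few lines of user-space C (a schematic, not the selftest code; the function names are made up):

#include <stdio.h>

static int dynticks_nesting;

static void hardirq_enter(void) { dynticks_nesting++; }	/* rcu_irq_enter() side */
static void hardirq_exit_broken(void) { /* __irq_exit(): no decrement */ }

int main(void)
{
	for (int i = 0; i < 72; i++) {
		hardirq_enter();
		hardirq_exit_broken();
	}
	/* RCU-style check: nonzero nesting => CPU treated as non-idle */
	printf("nesting=%d, CPU wrongly seen as busy\n", dynticks_nesting);
	return 0;
}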
In contrast, with Frederic's patch, which replaces the irq_enter() in HARDIRQ_ENTER() with an __irq_enter(), these tests don't ever call either rcu_irq_enter() or rcu_irq_exit(), which works because the CPU running the test is already marked as not being in dyntick-idle mode. This means that the rcu_irq_enter() and rcu_irq_exit() calls remain balanced, and RCU then has no problem working out which CPUs are in dyntick-idle mode and which are not. The reason that the imbalance was not noticed before the barrier patch was applied is that the old implementation of rcu_enter_nohz() ignored the nesting depth. This could still result in delays, but much shorter ones. Whenever there was a delay, RCU would IPI the CPU with the unbalanced nesting level, which would eventually result in rcu_enter_nohz() being called, which in turn would force RCU to see that the CPU was in dyntick-idle mode. The reason that very few people noticed the problem is that the mismatched irq_enter() vs. __irq_exit() occurred only when the kernel was built with CONFIG_DEBUG_LOCKING_API_SELFTESTS. Signed-off-by: Paul E. McKenney Reviewed-by: Josh Triplett --- Documentation/RCU/trace.txt | 17 ++----- kernel/rcutree.c | 111 ++++++++++++++++++++------------------ kernel/rcutree.h | 9 ++-- kernel/rcutree_plugin.h | 7 ++- kernel/rcutree_trace.c | 12 ++--- 5 files changed, 67 insertions(+), 89 deletions(-) (limited to 'kernel') diff --git a/Documentation/RCU/trace.txt b/Documentation/RCU/trace.txt index c078ad48f7a..8173cec473a 100644 --- a/Documentation/RCU/trace.txt +++ b/Documentation/RCU/trace.txt @@ -99,18 +99,11 @@ o "qp" indicates that RCU still expects a quiescent state from o "dt" is the current value of the dyntick counter that is incremented when entering or leaving dynticks idle state, either by the - scheduler or by irq. The number after the "/" is the interrupt - nesting depth when in dyntick-idle state, or one greater than - the interrupt-nesting depth otherwise. - - This field is displayed only for CONFIG_NO_HZ kernels. - -o "dn" is the current value of the dyntick counter that is incremented - when entering or leaving dynticks idle state via NMI. If both - the "dt" and "dn" values are even, then this CPU is in dynticks - idle mode and may be ignored by RCU. If either of these two - counters is odd, then RCU must be alert to the possibility of - an RCU read-side critical section running on this CPU. + scheduler or by irq. This number is even if the CPU is in + dyntick idle mode and odd otherwise. The number after the first + "/" is the interrupt nesting depth when in dyntick-idle state, + or one greater than the interrupt-nesting depth otherwise. + The number after the second "/" is the NMI nesting depth. This field is displayed only for CONFIG_NO_HZ kernels.
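With the new format, a "dt=" field decodes as described above; for example (illustrative values), "dt=5/1/0" means an odd dynticks counter (the CPU is not in dyntick-idle mode), irq nesting depth 1, and NMI nesting depth 0. The even/odd test is simply:

/* Sketch of the even/odd convention documented above. */
static inline int in_dyntick_idle(int dynticks)
{
	return !(dynticks & 0x1);	/* even => idle, odd => active */
}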
diff --git a/kernel/rcutree.c b/kernel/rcutree.c index 99c6038ad04..5616b17e4a2 100644 --- a/kernel/rcutree.c +++ b/kernel/rcutree.c @@ -162,7 +162,7 @@ EXPORT_SYMBOL_GPL(rcu_note_context_switch); #ifdef CONFIG_NO_HZ DEFINE_PER_CPU(struct rcu_dynticks, rcu_dynticks) = { .dynticks_nesting = 1, - .dynticks = 1, + .dynticks = ATOMIC_INIT(1), }; #endif /* #ifdef CONFIG_NO_HZ */ @@ -321,13 +321,25 @@ void rcu_enter_nohz(void) unsigned long flags; struct rcu_dynticks *rdtp; - smp_mb(); /* CPUs seeing ++ must see prior RCU read-side crit sects */ local_irq_save(flags); rdtp = &__get_cpu_var(rcu_dynticks); - if (--rdtp->dynticks_nesting == 0) - rdtp->dynticks++; - WARN_ON_ONCE(rdtp->dynticks & 0x1); + if (--rdtp->dynticks_nesting) { + local_irq_restore(flags); + return; + } + /* CPUs seeing atomic_inc() must see prior RCU read-side crit sects */ + smp_mb__before_atomic_inc(); /* See above. */ + atomic_inc(&rdtp->dynticks); + smp_mb__after_atomic_inc(); /* Force ordering with next sojourn. */ + WARN_ON_ONCE(atomic_read(&rdtp->dynticks) & 0x1); local_irq_restore(flags); + + /* If the interrupt queued a callback, get out of dyntick mode. */ + if (in_irq() && + (__get_cpu_var(rcu_sched_data).nxtlist || + __get_cpu_var(rcu_bh_data).nxtlist || + rcu_preempt_needs_cpu(smp_processor_id()))) + set_need_resched(); } /* @@ -343,11 +355,16 @@ void rcu_exit_nohz(void) local_irq_save(flags); rdtp = &__get_cpu_var(rcu_dynticks); - rdtp->dynticks++; - rdtp->dynticks_nesting++; - WARN_ON_ONCE(!(rdtp->dynticks & 0x1)); + if (rdtp->dynticks_nesting++) { + local_irq_restore(flags); + return; + } + smp_mb__before_atomic_inc(); /* Force ordering w/previous sojourn. */ + atomic_inc(&rdtp->dynticks); + /* CPUs seeing atomic_inc() must see later RCU read-side crit sects */ + smp_mb__after_atomic_inc(); /* See above. */ + WARN_ON_ONCE(!(atomic_read(&rdtp->dynticks) & 0x1)); local_irq_restore(flags); - smp_mb(); /* CPUs seeing ++ must see later RCU read-side crit sects */ } /** @@ -361,11 +378,15 @@ void rcu_nmi_enter(void) { struct rcu_dynticks *rdtp = &__get_cpu_var(rcu_dynticks); - if (rdtp->dynticks & 0x1) + if (rdtp->dynticks_nmi_nesting == 0 && + (atomic_read(&rdtp->dynticks) & 0x1)) return; - rdtp->dynticks_nmi++; - WARN_ON_ONCE(!(rdtp->dynticks_nmi & 0x1)); - smp_mb(); /* CPUs seeing ++ must see later RCU read-side crit sects */ + rdtp->dynticks_nmi_nesting++; + smp_mb__before_atomic_inc(); /* Force delay from prior write. */ + atomic_inc(&rdtp->dynticks); + /* CPUs seeing atomic_inc() must see later RCU read-side crit sects */ + smp_mb__after_atomic_inc(); /* See above. */ + WARN_ON_ONCE(!(atomic_read(&rdtp->dynticks) & 0x1)); } /** @@ -379,11 +400,14 @@ void rcu_nmi_exit(void) { struct rcu_dynticks *rdtp = &__get_cpu_var(rcu_dynticks); - if (rdtp->dynticks & 0x1) + if (rdtp->dynticks_nmi_nesting == 0 || + --rdtp->dynticks_nmi_nesting != 0) return; - smp_mb(); /* CPUs seeing ++ must see prior RCU read-side crit sects */ - rdtp->dynticks_nmi++; - WARN_ON_ONCE(rdtp->dynticks_nmi & 0x1); + /* CPUs seeing atomic_inc() must see prior RCU read-side crit sects */ + smp_mb__before_atomic_inc(); /* See above. */ + atomic_inc(&rdtp->dynticks); + smp_mb__after_atomic_inc(); /* Force delay to next write. 
*/ + WARN_ON_ONCE(atomic_read(&rdtp->dynticks) & 0x1); } /** @@ -394,13 +418,7 @@ void rcu_nmi_exit(void) */ void rcu_irq_enter(void) { - struct rcu_dynticks *rdtp = &__get_cpu_var(rcu_dynticks); - - if (rdtp->dynticks_nesting++) - return; - rdtp->dynticks++; - WARN_ON_ONCE(!(rdtp->dynticks & 0x1)); - smp_mb(); /* CPUs seeing ++ must see later RCU read-side crit sects */ + rcu_exit_nohz(); } /** @@ -412,19 +430,7 @@ void rcu_irq_enter(void) */ void rcu_irq_exit(void) { - struct rcu_dynticks *rdtp = &__get_cpu_var(rcu_dynticks); - - if (--rdtp->dynticks_nesting) - return; - smp_mb(); /* CPUs seeing ++ must see prior RCU read-side crit sects */ - rdtp->dynticks++; - WARN_ON_ONCE(rdtp->dynticks & 0x1); - - /* If the interrupt queued a callback, get out of dyntick mode. */ - if (in_irq() && - (__this_cpu_read(rcu_sched_data.nxtlist) || - __this_cpu_read(rcu_bh_data.nxtlist))) - set_need_resched(); + rcu_enter_nohz(); } #ifdef CONFIG_SMP @@ -436,19 +442,8 @@ void rcu_irq_exit(void) */ static int dyntick_save_progress_counter(struct rcu_data *rdp) { - int ret; - int snap; - int snap_nmi; - - snap = rdp->dynticks->dynticks; - snap_nmi = rdp->dynticks->dynticks_nmi; - smp_mb(); /* Order sampling of snap with end of grace period. */ - rdp->dynticks_snap = snap; - rdp->dynticks_nmi_snap = snap_nmi; - ret = ((snap & 0x1) == 0) && ((snap_nmi & 0x1) == 0); - if (ret) - rdp->dynticks_fqs++; - return ret; + rdp->dynticks_snap = atomic_add_return(0, &rdp->dynticks->dynticks); + return 0; } /* @@ -459,16 +454,11 @@ static int dyntick_save_progress_counter(struct rcu_data *rdp) */ static int rcu_implicit_dynticks_qs(struct rcu_data *rdp) { - long curr; - long curr_nmi; - long snap; - long snap_nmi; + unsigned long curr; + unsigned long snap; - curr = rdp->dynticks->dynticks; - snap = rdp->dynticks_snap; - curr_nmi = rdp->dynticks->dynticks_nmi; - snap_nmi = rdp->dynticks_nmi_snap; - smp_mb(); /* force ordering with cpu entering/leaving dynticks. */ + curr = (unsigned long)atomic_add_return(0, &rdp->dynticks->dynticks); + snap = (unsigned long)rdp->dynticks_snap; /* * If the CPU passed through or entered a dynticks idle phase with @@ -478,8 +468,7 @@ static int rcu_implicit_dynticks_qs(struct rcu_data *rdp) * read-side critical section that started before the beginning * of the current RCU grace period. */ - if ((curr != snap || (curr & 0x1) == 0) && - (curr_nmi != snap_nmi || (curr_nmi & 0x1) == 0)) { + if ((curr & 0x1) == 0 || ULONG_CMP_GE(curr, snap + 2)) { rdp->dynticks_fqs++; return 1; } diff --git a/kernel/rcutree.h b/kernel/rcutree.h index 257664815d5..93d4a1c2e88 100644 --- a/kernel/rcutree.h +++ b/kernel/rcutree.h @@ -84,11 +84,9 @@ * Dynticks per-CPU state. */ struct rcu_dynticks { - int dynticks_nesting; /* Track nesting level, sort of. */ - int dynticks; /* Even value for dynticks-idle, else odd. */ - int dynticks_nmi; /* Even value for either dynticks-idle or */ - /* not in nmi handler, else odd. So this */ - /* remains even for nmi from irq handler. */ + int dynticks_nesting; /* Track irq/process nesting level. */ + int dynticks_nmi_nesting; /* Track NMI nesting level. */ + atomic_t dynticks; /* Even value for dynticks-idle, else odd. */ }; /* RCU's kthread states for tracing. */ @@ -284,7 +282,6 @@ struct rcu_data { /* 3) dynticks interface. */ struct rcu_dynticks *dynticks; /* Shared per-CPU dynticks state. */ int dynticks_snap; /* Per-GP tracking for dynticks. */ - int dynticks_nmi_snap; /* Per-GP tracking for dynticks_nmi. 
*/ #endif /* #ifdef CONFIG_NO_HZ */ /* 4) reasons this CPU needed to be kicked by force_quiescent_state */ diff --git a/kernel/rcutree_plugin.h b/kernel/rcutree_plugin.h index 3f6559a5f5c..ed339702481 100644 --- a/kernel/rcutree_plugin.h +++ b/kernel/rcutree_plugin.h @@ -1520,7 +1520,6 @@ int rcu_needs_cpu(int cpu) { int c = 0; int snap; - int snap_nmi; int thatcpu; /* Check for being in the holdoff period. */ @@ -1531,10 +1530,10 @@ int rcu_needs_cpu(int cpu) for_each_online_cpu(thatcpu) { if (thatcpu == cpu) continue; - snap = per_cpu(rcu_dynticks, thatcpu).dynticks; - snap_nmi = per_cpu(rcu_dynticks, thatcpu).dynticks_nmi; + snap = atomic_add_return(0, &per_cpu(rcu_dynticks, + thatcpu).dynticks); smp_mb(); /* Order sampling of snap with end of grace period. */ - if (((snap & 0x1) != 0) || ((snap_nmi & 0x1) != 0)) { + if ((snap & 0x1) != 0) { per_cpu(rcu_dyntick_drain, cpu) = 0; per_cpu(rcu_dyntick_holdoff, cpu) = jiffies - 1; return rcu_needs_cpu_quick_check(cpu); diff --git a/kernel/rcutree_trace.c b/kernel/rcutree_trace.c index aa0fd72b4bc..9678cc3650f 100644 --- a/kernel/rcutree_trace.c +++ b/kernel/rcutree_trace.c @@ -69,10 +69,10 @@ static void print_one_rcu_data(struct seq_file *m, struct rcu_data *rdp) rdp->passed_quiesc, rdp->passed_quiesc_completed, rdp->qs_pending); #ifdef CONFIG_NO_HZ - seq_printf(m, " dt=%d/%d dn=%d df=%lu", - rdp->dynticks->dynticks, + seq_printf(m, " dt=%d/%d/%d df=%lu", + atomic_read(&rdp->dynticks->dynticks), rdp->dynticks->dynticks_nesting, - rdp->dynticks->dynticks_nmi, + rdp->dynticks->dynticks_nmi_nesting, rdp->dynticks_fqs); #endif /* #ifdef CONFIG_NO_HZ */ seq_printf(m, " of=%lu ri=%lu", rdp->offline_fqs, rdp->resched_ipi); @@ -141,9 +141,9 @@ static void print_one_rcu_data_csv(struct seq_file *m, struct rcu_data *rdp) rdp->qs_pending); #ifdef CONFIG_NO_HZ seq_printf(m, ",%d,%d,%d,%lu", - rdp->dynticks->dynticks, + atomic_read(&rdp->dynticks->dynticks), rdp->dynticks->dynticks_nesting, - rdp->dynticks->dynticks_nmi, + rdp->dynticks->dynticks_nmi_nesting, rdp->dynticks_fqs); #endif /* #ifdef CONFIG_NO_HZ */ seq_printf(m, ",%lu,%lu", rdp->offline_fqs, rdp->resched_ipi); @@ -167,7 +167,7 @@ static int show_rcudata_csv(struct seq_file *m, void *unused) { seq_puts(m, "\"CPU\",\"Online?\",\"c\",\"g\",\"pq\",\"pqc\",\"pq\","); #ifdef CONFIG_NO_HZ - seq_puts(m, "\"dt\",\"dt nesting\",\"dn\",\"df\","); + seq_puts(m, "\"dt\",\"dt nesting\",\"dt NMI nesting\",\"df\","); #endif /* #ifdef CONFIG_NO_HZ */ seq_puts(m, "\"of\",\"ri\",\"ql\",\"b\",\"ci\",\"co\",\"ca\"\n"); #ifdef CONFIG_TREE_PREEMPT_RCU -- cgit v1.2.3-70-g09d2 From 0775a60aca2375ea5598741b30d13fe6d3f15617 Mon Sep 17 00:00:00 2001 From: "Rafael J. Wysocki" Date: Fri, 27 May 2011 00:05:23 +0200 Subject: PM: Fix PM QOS's user mode interface to work with ASCII input Make pm_qos_power_write() accept values passed to it in the ASCII hex format either with or without an ending newline. Signed-off-by: Rafael J. 
Wysocki Acked-by: Mark Gross --- kernel/pm_qos_params.c | 33 +++++++++++++++++++++++---------- 1 file changed, 23 insertions(+), 10 deletions(-) (limited to 'kernel') diff --git a/kernel/pm_qos_params.c b/kernel/pm_qos_params.c index 0da058bff8e..68df076fec5 100644 --- a/kernel/pm_qos_params.c +++ b/kernel/pm_qos_params.c @@ -40,6 +40,7 @@ #include #include #include +#include #include @@ -404,24 +405,36 @@ static ssize_t pm_qos_power_write(struct file *filp, const char __user *buf, size_t count, loff_t *f_pos) { s32 value; - int x; - char ascii_value[11]; struct pm_qos_request_list *pm_qos_req; if (count == sizeof(s32)) { if (copy_from_user(&value, buf, sizeof(s32))) return -EFAULT; - } else if (count == 11) { /* len('0x12345678/0') */ - if (copy_from_user(ascii_value, buf, 11)) + } else if (count <= 11) { /* ASCII perhaps? */ + char ascii_value[11]; + unsigned long int ulval; + int ret; + + if (copy_from_user(ascii_value, buf, count)) return -EFAULT; - if (strlen(ascii_value) != 10) - return -EINVAL; - x = sscanf(ascii_value, "%x", &value); - if (x != 1) + + if (count > 10) { + if (ascii_value[10] == '\n') + ascii_value[10] = '\0'; + else + return -EINVAL; + } else { + ascii_value[count] = '\0'; + } + ret = strict_strtoul(ascii_value, 16, &ulval); + if (ret) { + pr_debug("%s, 0x%lx, 0x%x\n", ascii_value, ulval, ret); return -EINVAL; - pr_debug("%s, %d, 0x%x\n", ascii_value, x, value); - } else + } + value = (s32)lower_32_bits(ulval); + } else { return -EINVAL; + } pm_qos_req = filp->private_data; pm_qos_update_request(pm_qos_req, value); -- cgit v1.2.3-70-g09d2 From 4714d1d32d97239fb5ae3e10521d3f133a899b66 Mon Sep 17 00:00:00 2001 From: Ben Blum Date: Thu, 26 May 2011 16:25:18 -0700 Subject: cgroups: read-write lock CLONE_THREAD forking per threadgroup Adds functionality to read/write lock CLONE_THREAD fork()ing per-threadgroup Add an rwsem that lives in a threadgroup's signal_struct that's taken for reading in the fork path, under CONFIG_CGROUPS. If another part of the kernel later wants to use such a locking mechanism, the CONFIG_CGROUPS ifdefs should be changed to a higher-up flag that CGROUPS and the other system would both depend on. This is a pre-patch for cgroup-procs-write.patch. Signed-off-by: Ben Blum Cc: "Eric W. 
Biederman" Cc: Li Zefan Cc: Matt Helsley Reviewed-by: Paul Menage Cc: Oleg Nesterov Cc: David Rientjes Cc: Miao Xie Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- include/linux/init_task.h | 9 +++++++++ include/linux/sched.h | 36 ++++++++++++++++++++++++++++++++++++ kernel/fork.c | 10 ++++++++++ 3 files changed, 55 insertions(+) (limited to 'kernel') diff --git a/include/linux/init_task.h b/include/linux/init_task.h index bafc58c00fc..580f70c0239 100644 --- a/include/linux/init_task.h +++ b/include/linux/init_task.h @@ -22,6 +22,14 @@ extern struct files_struct init_files; extern struct fs_struct init_fs; +#ifdef CONFIG_CGROUPS +#define INIT_THREADGROUP_FORK_LOCK(sig) \ + .threadgroup_fork_lock = \ + __RWSEM_INITIALIZER(sig.threadgroup_fork_lock), +#else +#define INIT_THREADGROUP_FORK_LOCK(sig) +#endif + #define INIT_SIGNALS(sig) { \ .nr_threads = 1, \ .wait_chldexit = __WAIT_QUEUE_HEAD_INITIALIZER(sig.wait_chldexit),\ @@ -38,6 +46,7 @@ extern struct fs_struct init_fs; }, \ .cred_guard_mutex = \ __MUTEX_INITIALIZER(sig.cred_guard_mutex), \ + INIT_THREADGROUP_FORK_LOCK(sig) \ } extern struct nsproxy init_nsproxy; diff --git a/include/linux/sched.h b/include/linux/sched.h index f18300eddfc..dc8871295a5 100644 --- a/include/linux/sched.h +++ b/include/linux/sched.h @@ -513,6 +513,7 @@ struct thread_group_cputimer { spinlock_t lock; }; +#include struct autogroup; /* @@ -632,6 +633,16 @@ struct signal_struct { unsigned audit_tty; struct tty_audit_buf *tty_audit_buf; #endif +#ifdef CONFIG_CGROUPS + /* + * The threadgroup_fork_lock prevents threads from forking with + * CLONE_THREAD while held for writing. Use this for fork-sensitive + * threadgroup-wide operations. It's taken for reading in fork.c in + * copy_process(). + * Currently only needed write-side by cgroups. + */ + struct rw_semaphore threadgroup_fork_lock; +#endif int oom_adj; /* OOM kill score adjustment (bit shift) */ int oom_score_adj; /* OOM kill score adjustment */ @@ -2323,6 +2334,31 @@ static inline void unlock_task_sighand(struct task_struct *tsk, spin_unlock_irqrestore(&tsk->sighand->siglock, *flags); } +/* See the declaration of threadgroup_fork_lock in signal_struct. 
*/ +#ifdef CONFIG_CGROUPS +static inline void threadgroup_fork_read_lock(struct task_struct *tsk) +{ + down_read(&tsk->signal->threadgroup_fork_lock); +} +static inline void threadgroup_fork_read_unlock(struct task_struct *tsk) +{ + up_read(&tsk->signal->threadgroup_fork_lock); +} +static inline void threadgroup_fork_write_lock(struct task_struct *tsk) +{ + down_write(&tsk->signal->threadgroup_fork_lock); +} +static inline void threadgroup_fork_write_unlock(struct task_struct *tsk) +{ + up_write(&tsk->signal->threadgroup_fork_lock); +} +#else +static inline void threadgroup_fork_read_lock(struct task_struct *tsk) {} +static inline void threadgroup_fork_read_unlock(struct task_struct *tsk) {} +static inline void threadgroup_fork_write_lock(struct task_struct *tsk) {} +static inline void threadgroup_fork_write_unlock(struct task_struct *tsk) {} +#endif + #ifndef __HAVE_THREAD_FUNCTIONS #define task_thread_info(task) ((struct thread_info *)(task)->stack) diff --git a/kernel/fork.c b/kernel/fork.c index 8e7e135d081..1fa9d940e30 100644 --- a/kernel/fork.c +++ b/kernel/fork.c @@ -957,6 +957,10 @@ static int copy_signal(unsigned long clone_flags, struct task_struct *tsk) tty_audit_fork(sig); sched_autogroup_fork(sig); +#ifdef CONFIG_CGROUPS + init_rwsem(&sig->threadgroup_fork_lock); +#endif + sig->oom_adj = current->signal->oom_adj; sig->oom_score_adj = current->signal->oom_score_adj; sig->oom_score_adj_min = current->signal->oom_score_adj_min; @@ -1138,6 +1142,8 @@ static struct task_struct *copy_process(unsigned long clone_flags, monotonic_to_bootbased(&p->real_start_time); p->io_context = NULL; p->audit_context = NULL; + if (clone_flags & CLONE_THREAD) + threadgroup_fork_read_lock(current); cgroup_fork(p); #ifdef CONFIG_NUMA p->mempolicy = mpol_dup(p->mempolicy); @@ -1342,6 +1348,8 @@ static struct task_struct *copy_process(unsigned long clone_flags, write_unlock_irq(&tasklist_lock); proc_fork_connector(p); cgroup_post_fork(p); + if (clone_flags & CLONE_THREAD) + threadgroup_fork_read_unlock(current); perf_event_fork(p); return p; @@ -1380,6 +1388,8 @@ bad_fork_cleanup_policy: mpol_put(p->mempolicy); bad_fork_cleanup_cgroup: #endif + if (clone_flags & CLONE_THREAD) + threadgroup_fork_read_unlock(current); cgroup_exit(p, cgroup_callbacks_done); delayacct_tsk_free(p); module_put(task_thread_info(p)->exec_domain->module); -- cgit v1.2.3-70-g09d2 From f780bdb7c1c73009cb57adcf99ef50027d80bf3c Mon Sep 17 00:00:00 2001 From: Ben Blum Date: Thu, 26 May 2011 16:25:19 -0700 Subject: cgroups: add per-thread subsystem callbacks Add cgroup subsystem callbacks for per-thread attachment in atomic contexts Add can_attach_task(), pre_attach(), and attach_task() as new callbacks for cgroups's subsystem interface. Unlike can_attach and attach, these are for per-thread operations, to be called potentially many times when attaching an entire threadgroup. Also, the old "bool threadgroup" interface is removed, as replaced by this. All subsystems are modified for the new interface - of note is cpuset, which requires from/to nodemasks for attach to be globally scoped (though per-cpuset would work too) to persist from its pre_attach to attach_task and attach. This is a pre-patch for cgroup-procs-writable.patch. Signed-off-by: Ben Blum Cc: "Eric W. 
Biederman" Cc: Li Zefan Cc: Matt Helsley Reviewed-by: Paul Menage Cc: Oleg Nesterov Cc: David Rientjes Cc: Miao Xie Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- Documentation/cgroups/cgroups.txt | 30 ++++++++---- block/blk-cgroup.c | 18 +++----- include/linux/cgroup.h | 10 ++-- kernel/cgroup.c | 17 +++++-- kernel/cgroup_freezer.c | 26 ++++------- kernel/cpuset.c | 96 +++++++++++++++++++-------------------- kernel/sched.c | 38 ++-------------- mm/memcontrol.c | 18 +++----- security/device_cgroup.c | 3 +- 9 files changed, 114 insertions(+), 142 deletions(-) (limited to 'kernel') diff --git a/Documentation/cgroups/cgroups.txt b/Documentation/cgroups/cgroups.txt index aedf1bd02fd..b3bd3bdbe20 100644 --- a/Documentation/cgroups/cgroups.txt +++ b/Documentation/cgroups/cgroups.txt @@ -575,7 +575,7 @@ rmdir() will fail with it. From this behavior, pre_destroy() can be called multiple times against a cgroup. int can_attach(struct cgroup_subsys *ss, struct cgroup *cgrp, - struct task_struct *task, bool threadgroup) + struct task_struct *task) (cgroup_mutex held by caller) Called prior to moving a task into a cgroup; if the subsystem @@ -584,9 +584,14 @@ task is passed, then a successful result indicates that *any* unspecified task can be moved into the cgroup. Note that this isn't called on a fork. If this method returns 0 (success) then this should remain valid while the caller holds cgroup_mutex and it is ensured that either -attach() or cancel_attach() will be called in future. If threadgroup is -true, then a successful result indicates that all threads in the given -thread's threadgroup can be moved together. +attach() or cancel_attach() will be called in future. + +int can_attach_task(struct cgroup *cgrp, struct task_struct *tsk); +(cgroup_mutex held by caller) + +As can_attach, but for operations that must be run once per task to be +attached (possibly many when using cgroup_attach_proc). Called after +can_attach. void cancel_attach(struct cgroup_subsys *ss, struct cgroup *cgrp, struct task_struct *task, bool threadgroup) @@ -598,15 +603,24 @@ function, so that the subsystem can implement a rollback. If not, not necessary. This will be called only about subsystems whose can_attach() operation have succeeded. +void pre_attach(struct cgroup *cgrp); +(cgroup_mutex held by caller) + +For any non-per-thread attachment work that needs to happen before +attach_task. Needed by cpuset. + void attach(struct cgroup_subsys *ss, struct cgroup *cgrp, - struct cgroup *old_cgrp, struct task_struct *task, - bool threadgroup) + struct cgroup *old_cgrp, struct task_struct *task) (cgroup_mutex held by caller) Called after the task has been attached to the cgroup, to allow any post-attachment activity that requires memory allocations or blocking. -If threadgroup is true, the subsystem should take care of all threads -in the specified thread's threadgroup. Currently does not support any + +void attach_task(struct cgroup *cgrp, struct task_struct *tsk); +(cgroup_mutex held by caller) + +As attach, but for operations that must be run once per task to be attached, +like can_attach_task. Called before attach. Currently does not support any subsystem that might need the old_cgrp for every thread in the group. 
void fork(struct cgroup_subsy *ss, struct task_struct *task) diff --git a/block/blk-cgroup.c b/block/blk-cgroup.c index 07371cfdfae..bcaf16ee6ad 100644 --- a/block/blk-cgroup.c +++ b/block/blk-cgroup.c @@ -30,10 +30,8 @@ EXPORT_SYMBOL_GPL(blkio_root_cgroup); static struct cgroup_subsys_state *blkiocg_create(struct cgroup_subsys *, struct cgroup *); -static int blkiocg_can_attach(struct cgroup_subsys *, struct cgroup *, - struct task_struct *, bool); -static void blkiocg_attach(struct cgroup_subsys *, struct cgroup *, - struct cgroup *, struct task_struct *, bool); +static int blkiocg_can_attach_task(struct cgroup *, struct task_struct *); +static void blkiocg_attach_task(struct cgroup *, struct task_struct *); static void blkiocg_destroy(struct cgroup_subsys *, struct cgroup *); static int blkiocg_populate(struct cgroup_subsys *, struct cgroup *); @@ -46,8 +44,8 @@ static int blkiocg_populate(struct cgroup_subsys *, struct cgroup *); struct cgroup_subsys blkio_subsys = { .name = "blkio", .create = blkiocg_create, - .can_attach = blkiocg_can_attach, - .attach = blkiocg_attach, + .can_attach_task = blkiocg_can_attach_task, + .attach_task = blkiocg_attach_task, .destroy = blkiocg_destroy, .populate = blkiocg_populate, #ifdef CONFIG_BLK_CGROUP @@ -1616,9 +1614,7 @@ done: * of the main cic data structures. For now we allow a task to change * its cgroup only if it's the only owner of its ioc. */ -static int blkiocg_can_attach(struct cgroup_subsys *subsys, - struct cgroup *cgroup, struct task_struct *tsk, - bool threadgroup) +static int blkiocg_can_attach_task(struct cgroup *cgrp, struct task_struct *tsk) { struct io_context *ioc; int ret = 0; @@ -1633,9 +1629,7 @@ static int blkiocg_can_attach(struct cgroup_subsys *subsys, return ret; } -static void blkiocg_attach(struct cgroup_subsys *subsys, struct cgroup *cgroup, - struct cgroup *prev, struct task_struct *tsk, - bool threadgroup) +static void blkiocg_attach_task(struct cgroup *cgrp, struct task_struct *tsk) { struct io_context *ioc; diff --git a/include/linux/cgroup.h b/include/linux/cgroup.h index 5ac7ebc36db..1e6cde21fa3 100644 --- a/include/linux/cgroup.h +++ b/include/linux/cgroup.h @@ -467,12 +467,14 @@ struct cgroup_subsys { int (*pre_destroy)(struct cgroup_subsys *ss, struct cgroup *cgrp); void (*destroy)(struct cgroup_subsys *ss, struct cgroup *cgrp); int (*can_attach)(struct cgroup_subsys *ss, struct cgroup *cgrp, - struct task_struct *tsk, bool threadgroup); + struct task_struct *tsk); + int (*can_attach_task)(struct cgroup *cgrp, struct task_struct *tsk); void (*cancel_attach)(struct cgroup_subsys *ss, struct cgroup *cgrp, - struct task_struct *tsk, bool threadgroup); + struct task_struct *tsk); + void (*pre_attach)(struct cgroup *cgrp); + void (*attach_task)(struct cgroup *cgrp, struct task_struct *tsk); void (*attach)(struct cgroup_subsys *ss, struct cgroup *cgrp, - struct cgroup *old_cgrp, struct task_struct *tsk, - bool threadgroup); + struct cgroup *old_cgrp, struct task_struct *tsk); void (*fork)(struct cgroup_subsys *ss, struct task_struct *task); void (*exit)(struct cgroup_subsys *ss, struct cgroup *cgrp, struct cgroup *old_cgrp, struct task_struct *task); diff --git a/kernel/cgroup.c b/kernel/cgroup.c index 909a35510af..38fb0ad1cb4 100644 --- a/kernel/cgroup.c +++ b/kernel/cgroup.c @@ -1759,7 +1759,7 @@ int cgroup_attach_task(struct cgroup *cgrp, struct task_struct *tsk) for_each_subsys(root, ss) { if (ss->can_attach) { - retval = ss->can_attach(ss, cgrp, tsk, false); + retval = ss->can_attach(ss, cgrp, tsk); if (retval) { 
/* * Remember on which subsystem the can_attach() @@ -1771,6 +1771,13 @@ int cgroup_attach_task(struct cgroup *cgrp, struct task_struct *tsk) goto out; } } + if (ss->can_attach_task) { + retval = ss->can_attach_task(cgrp, tsk); + if (retval) { + failed_ss = ss; + goto out; + } + } } task_lock(tsk); @@ -1805,8 +1812,12 @@ int cgroup_attach_task(struct cgroup *cgrp, struct task_struct *tsk) write_unlock(&css_set_lock); for_each_subsys(root, ss) { + if (ss->pre_attach) + ss->pre_attach(cgrp); + if (ss->attach_task) + ss->attach_task(cgrp, tsk); if (ss->attach) - ss->attach(ss, cgrp, oldcgrp, tsk, false); + ss->attach(ss, cgrp, oldcgrp, tsk); } set_bit(CGRP_RELEASABLE, &oldcgrp->flags); synchronize_rcu(); @@ -1829,7 +1840,7 @@ out: */ break; if (ss->cancel_attach) - ss->cancel_attach(ss, cgrp, tsk, false); + ss->cancel_attach(ss, cgrp, tsk); } } return retval; diff --git a/kernel/cgroup_freezer.c b/kernel/cgroup_freezer.c index e7bebb7c6c3..e691818d7e4 100644 --- a/kernel/cgroup_freezer.c +++ b/kernel/cgroup_freezer.c @@ -160,7 +160,7 @@ static void freezer_destroy(struct cgroup_subsys *ss, */ static int freezer_can_attach(struct cgroup_subsys *ss, struct cgroup *new_cgroup, - struct task_struct *task, bool threadgroup) + struct task_struct *task) { struct freezer *freezer; @@ -172,26 +172,17 @@ static int freezer_can_attach(struct cgroup_subsys *ss, if (freezer->state != CGROUP_THAWED) return -EBUSY; + return 0; +} + +static int freezer_can_attach_task(struct cgroup *cgrp, struct task_struct *tsk) +{ rcu_read_lock(); - if (__cgroup_freezing_or_frozen(task)) { + if (__cgroup_freezing_or_frozen(tsk)) { rcu_read_unlock(); return -EBUSY; } rcu_read_unlock(); - - if (threadgroup) { - struct task_struct *c; - - rcu_read_lock(); - list_for_each_entry_rcu(c, &task->thread_group, thread_group) { - if (__cgroup_freezing_or_frozen(c)) { - rcu_read_unlock(); - return -EBUSY; - } - } - rcu_read_unlock(); - } - return 0; } @@ -390,6 +381,9 @@ struct cgroup_subsys freezer_subsys = { .populate = freezer_populate, .subsys_id = freezer_subsys_id, .can_attach = freezer_can_attach, + .can_attach_task = freezer_can_attach_task, + .pre_attach = NULL, + .attach_task = NULL, .attach = NULL, .fork = freezer_fork, .exit = NULL, diff --git a/kernel/cpuset.c b/kernel/cpuset.c index 2bb8c2e98ff..55b297d78ad 100644 --- a/kernel/cpuset.c +++ b/kernel/cpuset.c @@ -1367,14 +1367,10 @@ static int fmeter_getrate(struct fmeter *fmp) return val; } -/* Protected by cgroup_lock */ -static cpumask_var_t cpus_attach; - /* Called by cgroups to determine if a cpuset is usable; cgroup_mutex held */ static int cpuset_can_attach(struct cgroup_subsys *ss, struct cgroup *cont, - struct task_struct *tsk, bool threadgroup) + struct task_struct *tsk) { - int ret; struct cpuset *cs = cgroup_cs(cont); if (cpumask_empty(cs->cpus_allowed) || nodes_empty(cs->mems_allowed)) @@ -1391,29 +1387,42 @@ static int cpuset_can_attach(struct cgroup_subsys *ss, struct cgroup *cont, if (tsk->flags & PF_THREAD_BOUND) return -EINVAL; - ret = security_task_setscheduler(tsk); - if (ret) - return ret; - if (threadgroup) { - struct task_struct *c; - - rcu_read_lock(); - list_for_each_entry_rcu(c, &tsk->thread_group, thread_group) { - ret = security_task_setscheduler(c); - if (ret) { - rcu_read_unlock(); - return ret; - } - } - rcu_read_unlock(); - } return 0; } -static void cpuset_attach_task(struct task_struct *tsk, nodemask_t *to, - struct cpuset *cs) +static int cpuset_can_attach_task(struct cgroup *cgrp, struct task_struct *task) +{ + return 
security_task_setscheduler(task); +} + +/* + * Protected by cgroup_lock. The nodemasks must be stored globally because + * dynamically allocating them is not allowed in pre_attach, and they must + * persist among pre_attach, attach_task, and attach. + */ +static cpumask_var_t cpus_attach; +static nodemask_t cpuset_attach_nodemask_from; +static nodemask_t cpuset_attach_nodemask_to; + +/* Set-up work for before attaching each task. */ +static void cpuset_pre_attach(struct cgroup *cont) +{ + struct cpuset *cs = cgroup_cs(cont); + + if (cs == &top_cpuset) + cpumask_copy(cpus_attach, cpu_possible_mask); + else + guarantee_online_cpus(cs, cpus_attach); + + guarantee_online_mems(cs, &cpuset_attach_nodemask_to); +} + +/* Per-thread attachment work. */ +static void cpuset_attach_task(struct cgroup *cont, struct task_struct *tsk) { int err; + struct cpuset *cs = cgroup_cs(cont); + /* * can_attach beforehand should guarantee that this doesn't fail. * TODO: have a better way to handle failure here @@ -1421,45 +1430,29 @@ static void cpuset_attach_task(struct task_struct *tsk, nodemask_t *to, err = set_cpus_allowed_ptr(tsk, cpus_attach); WARN_ON_ONCE(err); - cpuset_change_task_nodemask(tsk, to); + cpuset_change_task_nodemask(tsk, &cpuset_attach_nodemask_to); cpuset_update_task_spread_flag(cs, tsk); - } static void cpuset_attach(struct cgroup_subsys *ss, struct cgroup *cont, - struct cgroup *oldcont, struct task_struct *tsk, - bool threadgroup) + struct cgroup *oldcont, struct task_struct *tsk) { struct mm_struct *mm; struct cpuset *cs = cgroup_cs(cont); struct cpuset *oldcs = cgroup_cs(oldcont); - static nodemask_t to; /* protected by cgroup_mutex */ - if (cs == &top_cpuset) { - cpumask_copy(cpus_attach, cpu_possible_mask); - } else { - guarantee_online_cpus(cs, cpus_attach); - } - guarantee_online_mems(cs, &to); - - /* do per-task migration stuff possibly for each in the threadgroup */ - cpuset_attach_task(tsk, &to, cs); - if (threadgroup) { - struct task_struct *c; - rcu_read_lock(); - list_for_each_entry_rcu(c, &tsk->thread_group, thread_group) { - cpuset_attach_task(c, &to, cs); - } - rcu_read_unlock(); - } - - /* change mm; only needs to be done once even if threadgroup */ - to = cs->mems_allowed; + /* + * Change mm, possibly for multiple threads in a threadgroup. This is + * expensive and may sleep. 
+ */ + cpuset_attach_nodemask_from = oldcs->mems_allowed; + cpuset_attach_nodemask_to = cs->mems_allowed; mm = get_task_mm(tsk); if (mm) { - mpol_rebind_mm(mm, &to); + mpol_rebind_mm(mm, &cpuset_attach_nodemask_to); if (is_memory_migrate(cs)) - cpuset_migrate_mm(mm, &oldcs->mems_allowed, &to); + cpuset_migrate_mm(mm, &cpuset_attach_nodemask_from, + &cpuset_attach_nodemask_to); mmput(mm); } } @@ -1911,6 +1904,9 @@ struct cgroup_subsys cpuset_subsys = { .create = cpuset_create, .destroy = cpuset_destroy, .can_attach = cpuset_can_attach, + .can_attach_task = cpuset_can_attach_task, + .pre_attach = cpuset_pre_attach, + .attach_task = cpuset_attach_task, .attach = cpuset_attach, .populate = cpuset_populate, .post_clone = cpuset_post_clone, diff --git a/kernel/sched.c b/kernel/sched.c index 2d12893b8b0..5e43e9dc65d 100644 --- a/kernel/sched.c +++ b/kernel/sched.c @@ -8764,42 +8764,10 @@ cpu_cgroup_can_attach_task(struct cgroup *cgrp, struct task_struct *tsk) return 0; } -static int -cpu_cgroup_can_attach(struct cgroup_subsys *ss, struct cgroup *cgrp, - struct task_struct *tsk, bool threadgroup) -{ - int retval = cpu_cgroup_can_attach_task(cgrp, tsk); - if (retval) - return retval; - if (threadgroup) { - struct task_struct *c; - rcu_read_lock(); - list_for_each_entry_rcu(c, &tsk->thread_group, thread_group) { - retval = cpu_cgroup_can_attach_task(cgrp, c); - if (retval) { - rcu_read_unlock(); - return retval; - } - } - rcu_read_unlock(); - } - return 0; -} - static void -cpu_cgroup_attach(struct cgroup_subsys *ss, struct cgroup *cgrp, - struct cgroup *old_cont, struct task_struct *tsk, - bool threadgroup) +cpu_cgroup_attach_task(struct cgroup *cgrp, struct task_struct *tsk) { sched_move_task(tsk); - if (threadgroup) { - struct task_struct *c; - rcu_read_lock(); - list_for_each_entry_rcu(c, &tsk->thread_group, thread_group) { - sched_move_task(c); - } - rcu_read_unlock(); - } } static void @@ -8887,8 +8855,8 @@ struct cgroup_subsys cpu_cgroup_subsys = { .name = "cpu", .create = cpu_cgroup_create, .destroy = cpu_cgroup_destroy, - .can_attach = cpu_cgroup_can_attach, - .attach = cpu_cgroup_attach, + .can_attach_task = cpu_cgroup_can_attach_task, + .attach_task = cpu_cgroup_attach_task, .exit = cpu_cgroup_exit, .populate = cpu_cgroup_populate, .subsys_id = cpu_cgroup_subsys_id, diff --git a/mm/memcontrol.c b/mm/memcontrol.c index d5fd3dcd3f2..fc259926c17 100644 --- a/mm/memcontrol.c +++ b/mm/memcontrol.c @@ -4953,8 +4953,7 @@ static void mem_cgroup_clear_mc(void) static int mem_cgroup_can_attach(struct cgroup_subsys *ss, struct cgroup *cgroup, - struct task_struct *p, - bool threadgroup) + struct task_struct *p) { int ret = 0; struct mem_cgroup *mem = mem_cgroup_from_cont(cgroup); @@ -4993,8 +4992,7 @@ static int mem_cgroup_can_attach(struct cgroup_subsys *ss, static void mem_cgroup_cancel_attach(struct cgroup_subsys *ss, struct cgroup *cgroup, - struct task_struct *p, - bool threadgroup) + struct task_struct *p) { mem_cgroup_clear_mc(); } @@ -5112,8 +5110,7 @@ retry: static void mem_cgroup_move_task(struct cgroup_subsys *ss, struct cgroup *cont, struct cgroup *old_cont, - struct task_struct *p, - bool threadgroup) + struct task_struct *p) { struct mm_struct *mm; @@ -5131,22 +5128,19 @@ static void mem_cgroup_move_task(struct cgroup_subsys *ss, #else /* !CONFIG_MMU */ static int mem_cgroup_can_attach(struct cgroup_subsys *ss, struct cgroup *cgroup, - struct task_struct *p, - bool threadgroup) + struct task_struct *p) { return 0; } static void mem_cgroup_cancel_attach(struct cgroup_subsys *ss, struct 
cgroup *cgroup, - struct task_struct *p, - bool threadgroup) + struct task_struct *p) { } static void mem_cgroup_move_task(struct cgroup_subsys *ss, struct cgroup *cont, struct cgroup *old_cont, - struct task_struct *p, - bool threadgroup) + struct task_struct *p) { } #endif diff --git a/security/device_cgroup.c b/security/device_cgroup.c index 8d9c48f1377..cd1f779fa51 100644 --- a/security/device_cgroup.c +++ b/security/device_cgroup.c @@ -62,8 +62,7 @@ static inline struct dev_cgroup *task_devcgroup(struct task_struct *task) struct cgroup_subsys devices_subsys; static int devcgroup_can_attach(struct cgroup_subsys *ss, - struct cgroup *new_cgroup, struct task_struct *task, - bool threadgroup) + struct cgroup *new_cgroup, struct task_struct *task) { if (current != task && !capable(CAP_SYS_ADMIN)) return -EPERM; -- cgit v1.2.3-70-g09d2 From 74a1166dfe1135dcc168d35fa5261aa7e087011b Mon Sep 17 00:00:00 2001 From: Ben Blum Date: Thu, 26 May 2011 16:25:20 -0700 Subject: cgroups: make procs file writable Make procs file writable to move all threads by tgid at once. Add functionality that enables users to move all threads in a threadgroup at once to a cgroup by writing the tgid to the 'cgroup.procs' file. This current implementation makes use of a per-threadgroup rwsem that's taken for reading in the fork() path to prevent newly forking threads within the threadgroup from "escaping" while the move is in progress. Signed-off-by: Ben Blum Cc: "Eric W. Biederman" Cc: Li Zefan Cc: Matt Helsley Reviewed-by: Paul Menage Cc: Oleg Nesterov Cc: David Rientjes Cc: Miao Xie Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- Documentation/cgroups/cgroups.txt | 9 +- kernel/cgroup.c | 439 ++++++++++++++++++++++++++++++++++---- 2 files changed, 401 insertions(+), 47 deletions(-) (limited to 'kernel') diff --git a/Documentation/cgroups/cgroups.txt b/Documentation/cgroups/cgroups.txt index b3bd3bdbe20..8c4f3466c89 100644 --- a/Documentation/cgroups/cgroups.txt +++ b/Documentation/cgroups/cgroups.txt @@ -236,7 +236,8 @@ containing the following files describing that cgroup: - cgroup.procs: list of tgids in the cgroup. This list is not guaranteed to be sorted or free of duplicate tgids, and userspace should sort/uniquify the list if this property is required. - This is a read-only file, for now. + Writing a thread group id into this file moves all threads in that + group into this cgroup. - notify_on_release flag: run the release agent on exit? - release_agent: the path to use for release notifications (this file exists in the top cgroup only) @@ -430,6 +431,12 @@ You can attach the current shell task by echoing 0: # echo 0 > tasks +You can use the cgroup.procs file instead of the tasks file to move all +threads in a threadgroup at once. Echoing the pid of any task in a +threadgroup to cgroup.procs causes all tasks in that threadgroup to be +be attached to the cgroup. Writing 0 to cgroup.procs moves all tasks +in the writing task's threadgroup. + Note: Since every task is always a member of exactly one cgroup in each mounted hierarchy, to remove a task from its current cgroup you must move it into a new cgroup (possibly the root cgroup) by writing to the diff --git a/kernel/cgroup.c b/kernel/cgroup.c index 38fb0ad1cb4..5e6a9745f0e 100644 --- a/kernel/cgroup.c +++ b/kernel/cgroup.c @@ -1735,6 +1735,76 @@ int cgroup_path(const struct cgroup *cgrp, char *buf, int buflen) } EXPORT_SYMBOL_GPL(cgroup_path); +/* + * cgroup_task_migrate - move a task from one cgroup to another. 
+ * + * 'guarantee' is set if the caller promises that a new css_set for the task + * will already exist. If not set, this function might sleep, and can fail with + * -ENOMEM. Otherwise, it can only fail with -ESRCH. + */ +static int cgroup_task_migrate(struct cgroup *cgrp, struct cgroup *oldcgrp, + struct task_struct *tsk, bool guarantee) +{ + struct css_set *oldcg; + struct css_set *newcg; + + /* + * get old css_set. we need to take task_lock and refcount it, because + * an exiting task can change its css_set to init_css_set and drop its + * old one without taking cgroup_mutex. + */ + task_lock(tsk); + oldcg = tsk->cgroups; + get_css_set(oldcg); + task_unlock(tsk); + + /* locate or allocate a new css_set for this task. */ + if (guarantee) { + /* we know the css_set we want already exists. */ + struct cgroup_subsys_state *template[CGROUP_SUBSYS_COUNT]; + read_lock(&css_set_lock); + newcg = find_existing_css_set(oldcg, cgrp, template); + BUG_ON(!newcg); + get_css_set(newcg); + read_unlock(&css_set_lock); + } else { + might_sleep(); + /* find_css_set will give us newcg already referenced. */ + newcg = find_css_set(oldcg, cgrp); + if (!newcg) { + put_css_set(oldcg); + return -ENOMEM; + } + } + put_css_set(oldcg); + + /* if PF_EXITING is set, the tsk->cgroups pointer is no longer safe. */ + task_lock(tsk); + if (tsk->flags & PF_EXITING) { + task_unlock(tsk); + put_css_set(newcg); + return -ESRCH; + } + rcu_assign_pointer(tsk->cgroups, newcg); + task_unlock(tsk); + + /* Update the css_set linked lists if we're using them */ + write_lock(&css_set_lock); + if (!list_empty(&tsk->cg_list)) + list_move(&tsk->cg_list, &newcg->tasks); + write_unlock(&css_set_lock); + + /* + * We just gained a reference on oldcg by taking it from the task. As + * trading it for newcg is protected by cgroup_mutex, we're safe to drop + * it here; it will be freed under RCU. 
+ */ + put_css_set(oldcg); + + set_bit(CGRP_RELEASABLE, &oldcgrp->flags); + return 0; +} + /** * cgroup_attach_task - attach task 'tsk' to cgroup 'cgrp' * @cgrp: the cgroup the task is attaching to @@ -1745,11 +1815,9 @@ EXPORT_SYMBOL_GPL(cgroup_path); */ int cgroup_attach_task(struct cgroup *cgrp, struct task_struct *tsk) { - int retval = 0; + int retval; struct cgroup_subsys *ss, *failed_ss = NULL; struct cgroup *oldcgrp; - struct css_set *cg; - struct css_set *newcg; struct cgroupfs_root *root = cgrp->root; /* Nothing to do if the task is already in that cgroup */ @@ -1780,36 +1848,9 @@ int cgroup_attach_task(struct cgroup *cgrp, struct task_struct *tsk) } } - task_lock(tsk); - cg = tsk->cgroups; - get_css_set(cg); - task_unlock(tsk); - /* - * Locate or allocate a new css_set for this task, - * based on its final set of cgroups - */ - newcg = find_css_set(cg, cgrp); - put_css_set(cg); - if (!newcg) { - retval = -ENOMEM; - goto out; - } - - task_lock(tsk); - if (tsk->flags & PF_EXITING) { - task_unlock(tsk); - put_css_set(newcg); - retval = -ESRCH; + retval = cgroup_task_migrate(cgrp, oldcgrp, tsk, false); + if (retval) goto out; - } - rcu_assign_pointer(tsk->cgroups, newcg); - task_unlock(tsk); - - /* Update the css_set linked lists if we're using them */ - write_lock(&css_set_lock); - if (!list_empty(&tsk->cg_list)) - list_move(&tsk->cg_list, &newcg->tasks); - write_unlock(&css_set_lock); for_each_subsys(root, ss) { if (ss->pre_attach) @@ -1819,9 +1860,8 @@ int cgroup_attach_task(struct cgroup *cgrp, struct task_struct *tsk) if (ss->attach) ss->attach(ss, cgrp, oldcgrp, tsk); } - set_bit(CGRP_RELEASABLE, &oldcgrp->flags); + synchronize_rcu(); - put_css_set(cg); /* * wake up rmdir() waiter. the rmdir should fail since the cgroup @@ -1871,49 +1911,356 @@ int cgroup_attach_task_all(struct task_struct *from, struct task_struct *tsk) EXPORT_SYMBOL_GPL(cgroup_attach_task_all); /* - * Attach task with pid 'pid' to cgroup 'cgrp'. Call with cgroup_mutex - * held. May take task_lock of task + * cgroup_attach_proc works in two stages, the first of which prefetches all + * new css_sets needed (to make sure we have enough memory before committing + * to the move) and stores them in a list of entries of the following type. + * TODO: possible optimization: use css_set->rcu_head for chaining instead + */ +struct cg_list_entry { + struct css_set *cg; + struct list_head links; +}; + +static bool css_set_check_fetched(struct cgroup *cgrp, + struct task_struct *tsk, struct css_set *cg, + struct list_head *newcg_list) +{ + struct css_set *newcg; + struct cg_list_entry *cg_entry; + struct cgroup_subsys_state *template[CGROUP_SUBSYS_COUNT]; + + read_lock(&css_set_lock); + newcg = find_existing_css_set(cg, cgrp, template); + if (newcg) + get_css_set(newcg); + read_unlock(&css_set_lock); + + /* doesn't exist at all? */ + if (!newcg) + return false; + /* see if it's already in the list */ + list_for_each_entry(cg_entry, newcg_list, links) { + if (cg_entry->cg == newcg) { + put_css_set(newcg); + return true; + } + } + + /* not found */ + put_css_set(newcg); + return false; +} + +/* + * Find the new css_set and store it in the list in preparation for moving the + * given task to the given cgroup. Returns 0 or -ENOMEM. 
+ */ +static int css_set_prefetch(struct cgroup *cgrp, struct css_set *cg, + struct list_head *newcg_list) +{ + struct css_set *newcg; + struct cg_list_entry *cg_entry; + + /* ensure a new css_set will exist for this thread */ + newcg = find_css_set(cg, cgrp); + if (!newcg) + return -ENOMEM; + /* add it to the list */ + cg_entry = kmalloc(sizeof(struct cg_list_entry), GFP_KERNEL); + if (!cg_entry) { + put_css_set(newcg); + return -ENOMEM; + } + cg_entry->cg = newcg; + list_add(&cg_entry->links, newcg_list); + return 0; +} + +/** + * cgroup_attach_proc - attach all threads in a threadgroup to a cgroup + * @cgrp: the cgroup to attach to + * @leader: the threadgroup leader task_struct of the group to be attached + * + * Call holding cgroup_mutex and the threadgroup_fork_lock of the leader. Will + * take task_lock of each thread in leader's threadgroup individually in turn. + */ +int cgroup_attach_proc(struct cgroup *cgrp, struct task_struct *leader) +{ + int retval, i, group_size; + struct cgroup_subsys *ss, *failed_ss = NULL; + bool cancel_failed_ss = false; + /* guaranteed to be initialized later, but the compiler needs this */ + struct cgroup *oldcgrp = NULL; + struct css_set *oldcg; + struct cgroupfs_root *root = cgrp->root; + /* threadgroup list cursor and array */ + struct task_struct *tsk; + struct task_struct **group; + /* + * we need to make sure we have css_sets for all the tasks we're + * going to move -before- we actually start moving them, so that in + * case we get an ENOMEM we can bail out before making any changes. + */ + struct list_head newcg_list; + struct cg_list_entry *cg_entry, *temp_nobe; + + /* + * step 0: in order to do expensive, possibly blocking operations for + * every thread, we cannot iterate the thread group list, since it needs + * rcu or tasklist locked. instead, build an array of all threads in the + * group - threadgroup_fork_lock prevents new threads from appearing, + * and if threads exit, this will just be an over-estimate. + */ + group_size = get_nr_threads(leader); + group = kmalloc(group_size * sizeof(*group), GFP_KERNEL); + if (!group) + return -ENOMEM; + + /* prevent changes to the threadgroup list while we take a snapshot. */ + rcu_read_lock(); + if (!thread_group_leader(leader)) { + /* + * a race with de_thread from another thread's exec() may strip + * us of our leadership, making while_each_thread unsafe to use + * on this task. if this happens, there is no choice but to + * throw this task away and try again (from cgroup_procs_write); + * this is "double-double-toil-and-trouble-check locking". + */ + rcu_read_unlock(); + retval = -EAGAIN; + goto out_free_group_list; + } + /* take a reference on each task in the group to go in the array. */ + tsk = leader; + i = 0; + do { + /* as per above, nr_threads may decrease, but not increase. */ + BUG_ON(i >= group_size); + get_task_struct(tsk); + group[i] = tsk; + i++; + } while_each_thread(leader, tsk); + /* remember the number of threads in the array for later. */ + group_size = i; + rcu_read_unlock(); + + /* + * step 1: check that we can legitimately attach to the cgroup. + */ + for_each_subsys(root, ss) { + if (ss->can_attach) { + retval = ss->can_attach(ss, cgrp, leader); + if (retval) { + failed_ss = ss; + goto out_cancel_attach; + } + } + /* a callback to be run on every thread in the threadgroup. */ + if (ss->can_attach_task) { + /* run on each task in the threadgroup. 
*/ + for (i = 0; i < group_size; i++) { + retval = ss->can_attach_task(cgrp, group[i]); + if (retval) { + failed_ss = ss; + cancel_failed_ss = true; + goto out_cancel_attach; + } + } + } + } + + /* + * step 2: make sure css_sets exist for all threads to be migrated. + * we use find_css_set, which allocates a new one if necessary. + */ + INIT_LIST_HEAD(&newcg_list); + for (i = 0; i < group_size; i++) { + tsk = group[i]; + /* nothing to do if this task is already in the cgroup */ + oldcgrp = task_cgroup_from_root(tsk, root); + if (cgrp == oldcgrp) + continue; + /* get old css_set pointer */ + task_lock(tsk); + if (tsk->flags & PF_EXITING) { + /* ignore this task if it's going away */ + task_unlock(tsk); + continue; + } + oldcg = tsk->cgroups; + get_css_set(oldcg); + task_unlock(tsk); + /* see if the new one for us is already in the list? */ + if (css_set_check_fetched(cgrp, tsk, oldcg, &newcg_list)) { + /* was already there, nothing to do. */ + put_css_set(oldcg); + } else { + /* we don't already have it. get new one. */ + retval = css_set_prefetch(cgrp, oldcg, &newcg_list); + put_css_set(oldcg); + if (retval) + goto out_list_teardown; + } + } + + /* + * step 3: now that we're guaranteed success wrt the css_sets, proceed + * to move all tasks to the new cgroup, calling ss->attach_task for each + * one along the way. there are no failure cases after here, so this is + * the commit point. + */ + for_each_subsys(root, ss) { + if (ss->pre_attach) + ss->pre_attach(cgrp); + } + for (i = 0; i < group_size; i++) { + tsk = group[i]; + /* leave current thread as it is if it's already there */ + oldcgrp = task_cgroup_from_root(tsk, root); + if (cgrp == oldcgrp) + continue; + /* attach each task to each subsystem */ + for_each_subsys(root, ss) { + if (ss->attach_task) + ss->attach_task(cgrp, tsk); + } + /* if the thread is PF_EXITING, it can just get skipped. */ + retval = cgroup_task_migrate(cgrp, oldcgrp, tsk, true); + BUG_ON(retval != 0 && retval != -ESRCH); + } + /* nothing is sensitive to fork() after this point. */ + + /* + * step 4: do expensive, non-thread-specific subsystem callbacks. + * TODO: if ever a subsystem needs to know the oldcgrp for each task + * being moved, this call will need to be reworked to communicate that. + */ + for_each_subsys(root, ss) { + if (ss->attach) + ss->attach(ss, cgrp, oldcgrp, leader); + } + + /* + * step 5: success! and cleanup + */ + synchronize_rcu(); + cgroup_wakeup_rmdir_waiter(cgrp); + retval = 0; +out_list_teardown: + /* clean up the list of prefetched css_sets. */ + list_for_each_entry_safe(cg_entry, temp_nobe, &newcg_list, links) { + list_del(&cg_entry->links); + put_css_set(cg_entry->cg); + kfree(cg_entry); + } +out_cancel_attach: + /* same deal as in cgroup_attach_task */ + if (retval) { + for_each_subsys(root, ss) { + if (ss == failed_ss) { + if (cancel_failed_ss && ss->cancel_attach) + ss->cancel_attach(ss, cgrp, leader); + break; + } + if (ss->cancel_attach) + ss->cancel_attach(ss, cgrp, leader); + } + } + /* clean up the array of referenced threads in the group. */ + for (i = 0; i < group_size; i++) + put_task_struct(group[i]); +out_free_group_list: + kfree(group); + return retval; +} + +/* + * Find the task_struct of the task to attach by vpid and pass it along to the + * function to attach either it or all tasks in its threadgroup. Will take + * cgroup_mutex; may take task_lock of task. 
*/ -static int attach_task_by_pid(struct cgroup *cgrp, u64 pid) +static int attach_task_by_pid(struct cgroup *cgrp, u64 pid, bool threadgroup) { struct task_struct *tsk; const struct cred *cred = current_cred(), *tcred; int ret; + if (!cgroup_lock_live_group(cgrp)) + return -ENODEV; + if (pid) { rcu_read_lock(); tsk = find_task_by_vpid(pid); - if (!tsk || tsk->flags & PF_EXITING) { + if (!tsk) { rcu_read_unlock(); + cgroup_unlock(); + return -ESRCH; + } + if (threadgroup) { + /* + * RCU protects this access, since tsk was found in the + * tid map. a race with de_thread may cause group_leader + * to stop being the leader, but cgroup_attach_proc will + * detect it later. + */ + tsk = tsk->group_leader; + } else if (tsk->flags & PF_EXITING) { + /* optimization for the single-task-only case */ + rcu_read_unlock(); + cgroup_unlock(); return -ESRCH; } + /* + * even if we're attaching all tasks in the thread group, we + * only need to check permissions on one of them. + */ tcred = __task_cred(tsk); if (cred->euid && cred->euid != tcred->uid && cred->euid != tcred->suid) { rcu_read_unlock(); + cgroup_unlock(); return -EACCES; } get_task_struct(tsk); rcu_read_unlock(); } else { - tsk = current; + if (threadgroup) + tsk = current->group_leader; + else + tsk = current; get_task_struct(tsk); } - ret = cgroup_attach_task(cgrp, tsk); + if (threadgroup) { + threadgroup_fork_write_lock(tsk); + ret = cgroup_attach_proc(cgrp, tsk); + threadgroup_fork_write_unlock(tsk); + } else { + ret = cgroup_attach_task(cgrp, tsk); + } put_task_struct(tsk); + cgroup_unlock(); return ret; } static int cgroup_tasks_write(struct cgroup *cgrp, struct cftype *cft, u64 pid) +{ + return attach_task_by_pid(cgrp, pid, false); +} + +static int cgroup_procs_write(struct cgroup *cgrp, struct cftype *cft, u64 tgid) { int ret; - if (!cgroup_lock_live_group(cgrp)) - return -ENODEV; - ret = attach_task_by_pid(cgrp, pid); - cgroup_unlock(); + do { + /* + * attach_proc fails with -EAGAIN if threadgroup leadership + * changes in the middle of the operation, in which case we need + * to find the task_struct for the new leader and start over. + */ + ret = attach_task_by_pid(cgrp, tgid, true); + } while (ret == -EAGAIN); return ret; } @@ -3270,9 +3617,9 @@ static struct cftype files[] = { { .name = CGROUP_FILE_GENERIC_PREFIX "procs", .open = cgroup_procs_open, - /* .write_u64 = cgroup_procs_write, TODO */ + .write_u64 = cgroup_procs_write, .release = cgroup_pidlist_release, - .mode = S_IRUGO, + .mode = S_IRUGO | S_IWUSR, }, { .name = "notify_on_release", -- cgit v1.2.3-70-g09d2 From d846687d7f84e45f23ecf3846dbb43312a1206dd Mon Sep 17 00:00:00 2001 From: Ben Blum Date: Thu, 26 May 2011 16:25:21 -0700 Subject: cgroups: use flex_array in attach_proc Convert cgroup_attach_proc to use flex_array. The cgroup_attach_proc implementation requires a pre-allocated array to store task pointers to atomically move a thread-group, but asking for a monolithic array with kmalloc() may be unreliable for very large groups. Using flex_array provides the same functionality with less risk of failure. This is a post-patch for cgroup-procs-write.patch. Signed-off-by: Ben Blum Cc: "Eric W. 
Biederman" Cc: Li Zefan Cc: Matt Helsley Reviewed-by: Paul Menage Cc: Oleg Nesterov Cc: David Rientjes Cc: Miao Xie Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- kernel/cgroup.c | 33 ++++++++++++++++++++++++--------- 1 file changed, 24 insertions(+), 9 deletions(-) (limited to 'kernel') diff --git a/kernel/cgroup.c b/kernel/cgroup.c index 5e6a9745f0e..00a884342d3 100644 --- a/kernel/cgroup.c +++ b/kernel/cgroup.c @@ -57,6 +57,7 @@ #include /* TODO: replace with more sophisticated array */ #include #include +#include /* used in cgroup_attach_proc */ #include @@ -1995,7 +1996,7 @@ int cgroup_attach_proc(struct cgroup *cgrp, struct task_struct *leader) struct cgroupfs_root *root = cgrp->root; /* threadgroup list cursor and array */ struct task_struct *tsk; - struct task_struct **group; + struct flex_array *group; /* * we need to make sure we have css_sets for all the tasks we're * going to move -before- we actually start moving them, so that in @@ -2012,9 +2013,15 @@ int cgroup_attach_proc(struct cgroup *cgrp, struct task_struct *leader) * and if threads exit, this will just be an over-estimate. */ group_size = get_nr_threads(leader); - group = kmalloc(group_size * sizeof(*group), GFP_KERNEL); + /* flex_array supports very large thread-groups better than kmalloc. */ + group = flex_array_alloc(sizeof(struct task_struct *), group_size, + GFP_KERNEL); if (!group) return -ENOMEM; + /* pre-allocate to guarantee space while iterating in rcu read-side. */ + retval = flex_array_prealloc(group, 0, group_size - 1, GFP_KERNEL); + if (retval) + goto out_free_group_list; /* prevent changes to the threadgroup list while we take a snapshot. */ rcu_read_lock(); @@ -2037,7 +2044,12 @@ int cgroup_attach_proc(struct cgroup *cgrp, struct task_struct *leader) /* as per above, nr_threads may decrease, but not increase. */ BUG_ON(i >= group_size); get_task_struct(tsk); - group[i] = tsk; + /* + * saying GFP_ATOMIC has no effect here because we did prealloc + * earlier, but it's good form to communicate our expectations. + */ + retval = flex_array_put_ptr(group, i, tsk, GFP_ATOMIC); + BUG_ON(retval != 0); i++; } while_each_thread(leader, tsk); /* remember the number of threads in the array for later. */ @@ -2059,7 +2071,8 @@ int cgroup_attach_proc(struct cgroup *cgrp, struct task_struct *leader) if (ss->can_attach_task) { /* run on each task in the threadgroup. */ for (i = 0; i < group_size; i++) { - retval = ss->can_attach_task(cgrp, group[i]); + tsk = flex_array_get_ptr(group, i); + retval = ss->can_attach_task(cgrp, tsk); if (retval) { failed_ss = ss; cancel_failed_ss = true; @@ -2075,7 +2088,7 @@ int cgroup_attach_proc(struct cgroup *cgrp, struct task_struct *leader) */ INIT_LIST_HEAD(&newcg_list); for (i = 0; i < group_size; i++) { - tsk = group[i]; + tsk = flex_array_get_ptr(group, i); /* nothing to do if this task is already in the cgroup */ oldcgrp = task_cgroup_from_root(tsk, root); if (cgrp == oldcgrp) @@ -2114,7 +2127,7 @@ int cgroup_attach_proc(struct cgroup *cgrp, struct task_struct *leader) ss->pre_attach(cgrp); } for (i = 0; i < group_size; i++) { - tsk = group[i]; + tsk = flex_array_get_ptr(group, i); /* leave current thread as it is if it's already there */ oldcgrp = task_cgroup_from_root(tsk, root); if (cgrp == oldcgrp) @@ -2167,10 +2180,12 @@ out_cancel_attach: } } /* clean up the array of referenced threads in the group. 
*/ - for (i = 0; i < group_size; i++) - put_task_struct(group[i]); + for (i = 0; i < group_size; i++) { + tsk = flex_array_get_ptr(group, i); + put_task_struct(tsk); + } out_free_group_list: - kfree(group); + flex_array_free(group); return retval; } -- cgit v1.2.3-70-g09d2 From a77aea92010acf54ad785047234418d5d68772e2 Mon Sep 17 00:00:00 2001 From: Daniel Lezcano Date: Thu, 26 May 2011 16:25:23 -0700 Subject: cgroup: remove the ns_cgroup The ns_cgroup is an annoying cgroup at the namespace / cgroup frontier and leads to some problems: * cgroup creation is out-of-control * cgroup names can conflict when pids wrap around * it is not possible to have a single process handling a lot of namespaces without running into exponential creation time * we may want to create a namespace without creating a cgroup The ns_cgroup was replaced by a compatibility flag 'clone_children', where a newly created cgroup will copy the parent cgroup values. Userspace has to manually create a cgroup and add a task to the 'tasks' file. This patch removes the ns_cgroup as suggested in the following thread: https://lists.linux-foundation.org/pipermail/containers/2009-June/018616.html The 'cgroup_clone' function is removed because it is no longer used. This is a userspace-visible change. Commit 45531757b45c ("cgroup: notify ns_cgroup deprecated") (merged into 2.6.27) caused the kernel to emit a printk message warning users that the feature is planned for removal. Since that time we have heard from XXX users who were affected by this. Signed-off-by: Daniel Lezcano Signed-off-by: Serge E. Hallyn Cc: Eric W. Biederman Cc: Jamal Hadi Salim Reviewed-by: Li Zefan Acked-by: Paul Menage Acked-by: Matt Helsley Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- Documentation/cgroups/cgroups.txt | 2 +- arch/mips/configs/bcm47xx_defconfig | 1 - arch/mn10300/configs/asb2364_defconfig | 1 - arch/powerpc/configs/ppc6xx_defconfig | 1 - arch/powerpc/configs/pseries_defconfig | 1 - arch/sh/configs/apsh4ad0a_defconfig | 1 - arch/sh/configs/sdk7786_defconfig | 1 - arch/sh/configs/se7206_defconfig | 1 - arch/sh/configs/shx3_defconfig | 1 - arch/sh/configs/urquell_defconfig | 1 - arch/x86/configs/i386_defconfig | 1 - arch/x86/configs/x86_64_defconfig | 1 - include/linux/cgroup.h | 3 - include/linux/cgroup_subsys.h | 6 -- include/linux/nsproxy.h | 9 --- init/Kconfig | 8 --- kernel/Makefile | 1 - kernel/cgroup.c | 116 -------------------------------- kernel/cpuset.c | 7 +- kernel/fork.c | 6 -- kernel/ns_cgroup.c | 118 --------------------------------- kernel/nsproxy.c | 4 -- 22 files changed, 4 insertions(+), 287 deletions(-) delete mode 100644 kernel/ns_cgroup.c (limited to 'kernel') diff --git a/Documentation/cgroups/cgroups.txt b/Documentation/cgroups/cgroups.txt index 8c4f3466c89..0ed99f08f1f 100644 --- a/Documentation/cgroups/cgroups.txt +++ b/Documentation/cgroups/cgroups.txt @@ -651,7 +651,7 @@ always handled well. void post_clone(struct cgroup_subsys *ss, struct cgroup *cgrp) (cgroup_mutex held by caller) -Called at the end of cgroup_clone() to do any parameter +Called during cgroup_create() to do any parameter initialization which might be required before a task could attach. For example in cpusets, no task may attach before 'cpus' and 'mems' are set up.
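The callback contract documented above is easy to picture concretely. Here is a minimal sketch of a post_clone() handler for a hypothetical "foo" subsystem; the cgroup_to_foo() helper and the ->limit field are illustrative assumptions, and only the callback signature comes from the documentation hunk above:

	/*
	 * Sketch: inherit a tunable from the parent when the child is created
	 * with clone_children set, so the new cgroup is usable before any
	 * task attaches. cgroup_mutex is held by the caller.
	 */
	static void foo_post_clone(struct cgroup_subsys *ss, struct cgroup *cgrp)
	{
		struct foo_cgroup *child, *parent;

		if (!cgrp->parent)
			return;	/* the root cgroup has nothing to inherit */
		child = cgroup_to_foo(cgrp);		/* assumed container_of() wrapper */
		parent = cgroup_to_foo(cgrp->parent);
		child->limit = parent->limit;	/* copy the parent cgroup value */
	}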
diff --git a/arch/mips/configs/bcm47xx_defconfig b/arch/mips/configs/bcm47xx_defconfig index 22fdf2f0cc2..ad15fb10322 100644 --- a/arch/mips/configs/bcm47xx_defconfig +++ b/arch/mips/configs/bcm47xx_defconfig @@ -16,7 +16,6 @@ CONFIG_TASK_IO_ACCOUNTING=y CONFIG_AUDIT=y CONFIG_TINY_RCU=y CONFIG_CGROUPS=y -CONFIG_CGROUP_NS=y CONFIG_CGROUP_CPUACCT=y CONFIG_RELAY=y CONFIG_BLK_DEV_INITRD=y diff --git a/arch/mn10300/configs/asb2364_defconfig b/arch/mn10300/configs/asb2364_defconfig index 31d76261a3d..fbb96ae3122 100644 --- a/arch/mn10300/configs/asb2364_defconfig +++ b/arch/mn10300/configs/asb2364_defconfig @@ -8,7 +8,6 @@ CONFIG_TASK_XACCT=y CONFIG_TASK_IO_ACCOUNTING=y CONFIG_LOG_BUF_SHIFT=14 CONFIG_CGROUPS=y -CONFIG_CGROUP_NS=y CONFIG_CGROUP_FREEZER=y CONFIG_CGROUP_DEVICE=y CONFIG_CGROUP_CPUACCT=y diff --git a/arch/powerpc/configs/ppc6xx_defconfig b/arch/powerpc/configs/ppc6xx_defconfig index 214208924a9..04360f9b010 100644 --- a/arch/powerpc/configs/ppc6xx_defconfig +++ b/arch/powerpc/configs/ppc6xx_defconfig @@ -10,7 +10,6 @@ CONFIG_TASK_XACCT=y CONFIG_TASK_IO_ACCOUNTING=y CONFIG_AUDIT=y CONFIG_CGROUPS=y -CONFIG_CGROUP_NS=y CONFIG_CGROUP_DEVICE=y CONFIG_CGROUP_CPUACCT=y CONFIG_RESOURCE_COUNTERS=y diff --git a/arch/powerpc/configs/pseries_defconfig b/arch/powerpc/configs/pseries_defconfig index 7de13865508..c9f212b5f3d 100644 --- a/arch/powerpc/configs/pseries_defconfig +++ b/arch/powerpc/configs/pseries_defconfig @@ -15,7 +15,6 @@ CONFIG_AUDITSYSCALL=y CONFIG_IKCONFIG=y CONFIG_IKCONFIG_PROC=y CONFIG_CGROUPS=y -CONFIG_CGROUP_NS=y CONFIG_CGROUP_FREEZER=y CONFIG_CGROUP_DEVICE=y CONFIG_CPUSETS=y diff --git a/arch/sh/configs/apsh4ad0a_defconfig b/arch/sh/configs/apsh4ad0a_defconfig index 77ec0e7b8dd..e7583484cc0 100644 --- a/arch/sh/configs/apsh4ad0a_defconfig +++ b/arch/sh/configs/apsh4ad0a_defconfig @@ -7,7 +7,6 @@ CONFIG_IKCONFIG=y CONFIG_IKCONFIG_PROC=y CONFIG_LOG_BUF_SHIFT=14 CONFIG_CGROUPS=y -CONFIG_CGROUP_NS=y CONFIG_CGROUP_FREEZER=y CONFIG_CGROUP_DEVICE=y CONFIG_CGROUP_CPUACCT=y diff --git a/arch/sh/configs/sdk7786_defconfig b/arch/sh/configs/sdk7786_defconfig index c41650572d7..8a7dd7b59c5 100644 --- a/arch/sh/configs/sdk7786_defconfig +++ b/arch/sh/configs/sdk7786_defconfig @@ -12,7 +12,6 @@ CONFIG_IKCONFIG=y CONFIG_IKCONFIG_PROC=y CONFIG_CGROUPS=y CONFIG_CGROUP_DEBUG=y -CONFIG_CGROUP_NS=y CONFIG_CGROUP_FREEZER=y CONFIG_CGROUP_DEVICE=y CONFIG_CPUSETS=y diff --git a/arch/sh/configs/se7206_defconfig b/arch/sh/configs/se7206_defconfig index a468ff227fc..72c3fad7383 100644 --- a/arch/sh/configs/se7206_defconfig +++ b/arch/sh/configs/se7206_defconfig @@ -8,7 +8,6 @@ CONFIG_RCU_TRACE=y CONFIG_LOG_BUF_SHIFT=14 CONFIG_CGROUPS=y CONFIG_CGROUP_DEBUG=y -CONFIG_CGROUP_NS=y CONFIG_CGROUP_DEVICE=y CONFIG_CGROUP_CPUACCT=y CONFIG_RESOURCE_COUNTERS=y diff --git a/arch/sh/configs/shx3_defconfig b/arch/sh/configs/shx3_defconfig index 3f92d37c637..6bb41303689 100644 --- a/arch/sh/configs/shx3_defconfig +++ b/arch/sh/configs/shx3_defconfig @@ -9,7 +9,6 @@ CONFIG_IKCONFIG=y CONFIG_IKCONFIG_PROC=y CONFIG_LOG_BUF_SHIFT=14 CONFIG_CGROUPS=y -CONFIG_CGROUP_NS=y CONFIG_CGROUP_FREEZER=y CONFIG_CGROUP_DEVICE=y CONFIG_CGROUP_CPUACCT=y diff --git a/arch/sh/configs/urquell_defconfig b/arch/sh/configs/urquell_defconfig index 7b3daec6fef..8bfa4d056d7 100644 --- a/arch/sh/configs/urquell_defconfig +++ b/arch/sh/configs/urquell_defconfig @@ -9,7 +9,6 @@ CONFIG_IKCONFIG_PROC=y CONFIG_LOG_BUF_SHIFT=14 CONFIG_CGROUPS=y CONFIG_CGROUP_DEBUG=y -CONFIG_CGROUP_NS=y CONFIG_CGROUP_FREEZER=y CONFIG_CGROUP_DEVICE=y CONFIG_CPUSETS=y 
diff --git a/arch/x86/configs/i386_defconfig b/arch/x86/configs/i386_defconfig index 6f9872658dd..2bf18059fbe 100644 --- a/arch/x86/configs/i386_defconfig +++ b/arch/x86/configs/i386_defconfig @@ -10,7 +10,6 @@ CONFIG_TASK_IO_ACCOUNTING=y CONFIG_AUDIT=y CONFIG_LOG_BUF_SHIFT=18 CONFIG_CGROUPS=y -CONFIG_CGROUP_NS=y CONFIG_CGROUP_FREEZER=y CONFIG_CPUSETS=y CONFIG_CGROUP_CPUACCT=y diff --git a/arch/x86/configs/x86_64_defconfig b/arch/x86/configs/x86_64_defconfig index ee01a9d5d4f..22a0dc8e51d 100644 --- a/arch/x86/configs/x86_64_defconfig +++ b/arch/x86/configs/x86_64_defconfig @@ -11,7 +11,6 @@ CONFIG_TASK_IO_ACCOUNTING=y CONFIG_AUDIT=y CONFIG_LOG_BUF_SHIFT=18 CONFIG_CGROUPS=y -CONFIG_CGROUP_NS=y CONFIG_CGROUP_FREEZER=y CONFIG_CPUSETS=y CONFIG_CGROUP_CPUACCT=y diff --git a/include/linux/cgroup.h b/include/linux/cgroup.h index 1e6cde21fa3..ab4ac0ccb85 100644 --- a/include/linux/cgroup.h +++ b/include/linux/cgroup.h @@ -555,9 +555,6 @@ static inline struct cgroup* task_cgroup(struct task_struct *task, return task_subsys_state(task, subsys_id)->cgroup; } -int cgroup_clone(struct task_struct *tsk, struct cgroup_subsys *ss, - char *nodename); - /* A cgroup_iter should be treated as an opaque object */ struct cgroup_iter { struct list_head *cg_link; diff --git a/include/linux/cgroup_subsys.h b/include/linux/cgroup_subsys.h index cdbfcb8780e..ac663c18776 100644 --- a/include/linux/cgroup_subsys.h +++ b/include/linux/cgroup_subsys.h @@ -19,12 +19,6 @@ SUBSYS(debug) /* */ -#ifdef CONFIG_CGROUP_NS -SUBSYS(ns) -#endif - -/* */ - #ifdef CONFIG_CGROUP_SCHED SUBSYS(cpu_cgroup) #endif diff --git a/include/linux/nsproxy.h b/include/linux/nsproxy.h index 7b370c7cfef..50d20aba57d 100644 --- a/include/linux/nsproxy.h +++ b/include/linux/nsproxy.h @@ -81,13 +81,4 @@ static inline void get_nsproxy(struct nsproxy *ns) atomic_inc(&ns->count); } -#ifdef CONFIG_CGROUP_NS -int ns_cgroup_clone(struct task_struct *tsk, struct pid *pid); -#else -static inline int ns_cgroup_clone(struct task_struct *tsk, struct pid *pid) -{ - return 0; -} -#endif - #endif diff --git a/init/Kconfig b/init/Kconfig index 332aac64996..ebafac4231e 100644 --- a/init/Kconfig +++ b/init/Kconfig @@ -589,14 +589,6 @@ config CGROUP_DEBUG Say N if unsure. -config CGROUP_NS - bool "Namespace cgroup subsystem" - help - Provides a simple namespace cgroup subsystem to - provide hierarchical naming of sets of namespaces, - for instance virtual servers and checkpoint/restart - jobs. - config CGROUP_FREEZER bool "Freezer cgroup subsystem" help diff --git a/kernel/Makefile b/kernel/Makefile index e9cf19155b4..2d64cfcc8b4 100644 --- a/kernel/Makefile +++ b/kernel/Makefile @@ -61,7 +61,6 @@ obj-$(CONFIG_COMPAT) += compat.o obj-$(CONFIG_CGROUPS) += cgroup.o obj-$(CONFIG_CGROUP_FREEZER) += cgroup_freezer.o obj-$(CONFIG_CPUSETS) += cpuset.o -obj-$(CONFIG_CGROUP_NS) += ns_cgroup.o obj-$(CONFIG_UTS_NS) += utsname.o obj-$(CONFIG_USER_NS) += user_namespace.o obj-$(CONFIG_PID_NS) += pid_namespace.o diff --git a/kernel/cgroup.c b/kernel/cgroup.c index 00a884342d3..2731d115d72 100644 --- a/kernel/cgroup.c +++ b/kernel/cgroup.c @@ -4629,122 +4629,6 @@ void cgroup_exit(struct task_struct *tsk, int run_callbacks) put_css_set_taskexit(cg); } -/** - * cgroup_clone - clone the cgroup the given subsystem is attached to - * @tsk: the task to be moved - * @subsys: the given subsystem - * @nodename: the name for the new cgroup - * - * Duplicate the current cgroup in the hierarchy that the given - * subsystem is attached to, and move this task into the new - * child. 
- */ -int cgroup_clone(struct task_struct *tsk, struct cgroup_subsys *subsys, - char *nodename) -{ - struct dentry *dentry; - int ret = 0; - struct cgroup *parent, *child; - struct inode *inode; - struct css_set *cg; - struct cgroupfs_root *root; - struct cgroup_subsys *ss; - - /* We shouldn't be called by an unregistered subsystem */ - BUG_ON(!subsys->active); - - /* First figure out what hierarchy and cgroup we're dealing - * with, and pin them so we can drop cgroup_mutex */ - mutex_lock(&cgroup_mutex); - again: - root = subsys->root; - if (root == &rootnode) { - mutex_unlock(&cgroup_mutex); - return 0; - } - - /* Pin the hierarchy */ - if (!atomic_inc_not_zero(&root->sb->s_active)) { - /* We race with the final deactivate_super() */ - mutex_unlock(&cgroup_mutex); - return 0; - } - - /* Keep the cgroup alive */ - task_lock(tsk); - parent = task_cgroup(tsk, subsys->subsys_id); - cg = tsk->cgroups; - get_css_set(cg); - task_unlock(tsk); - - mutex_unlock(&cgroup_mutex); - - /* Now do the VFS work to create a cgroup */ - inode = parent->dentry->d_inode; - - /* Hold the parent directory mutex across this operation to - * stop anyone else deleting the new cgroup */ - mutex_lock(&inode->i_mutex); - dentry = lookup_one_len(nodename, parent->dentry, strlen(nodename)); - if (IS_ERR(dentry)) { - printk(KERN_INFO - "cgroup: Couldn't allocate dentry for %s: %ld\n", nodename, - PTR_ERR(dentry)); - ret = PTR_ERR(dentry); - goto out_release; - } - - /* Create the cgroup directory, which also creates the cgroup */ - ret = vfs_mkdir(inode, dentry, 0755); - child = __d_cgrp(dentry); - dput(dentry); - if (ret) { - printk(KERN_INFO - "Failed to create cgroup %s: %d\n", nodename, - ret); - goto out_release; - } - - /* The cgroup now exists. Retake cgroup_mutex and check - * that we're still in the same state that we thought we - * were. */ - mutex_lock(&cgroup_mutex); - if ((root != subsys->root) || - (parent != task_cgroup(tsk, subsys->subsys_id))) { - /* Aargh, we raced ... */ - mutex_unlock(&inode->i_mutex); - put_css_set(cg); - - deactivate_super(root->sb); - /* The cgroup is still accessible in the VFS, but - * we're not going to try to rmdir() it at this - * point. */ - printk(KERN_INFO - "Race in cgroup_clone() - leaking cgroup %s\n", - nodename); - goto again; - } - - /* do any required auto-setup */ - for_each_subsys(root, ss) { - if (ss->post_clone) - ss->post_clone(ss, child); - } - - /* All seems fine. Finish by moving the task into the new cgroup */ - ret = cgroup_attach_task(child, tsk); - mutex_unlock(&cgroup_mutex); - - out_release: - mutex_unlock(&inode->i_mutex); - - mutex_lock(&cgroup_mutex); - put_css_set(cg); - mutex_unlock(&cgroup_mutex); - deactivate_super(root->sb); - return ret; -} - /** * cgroup_is_descendant - see if @cgrp is a descendant of @task's cgrp * @cgrp: the cgroup in question diff --git a/kernel/cpuset.c b/kernel/cpuset.c index 55b297d78ad..1ceeb049c82 100644 --- a/kernel/cpuset.c +++ b/kernel/cpuset.c @@ -1802,10 +1802,9 @@ static int cpuset_populate(struct cgroup_subsys *ss, struct cgroup *cont) } /* - * post_clone() is called at the end of cgroup_clone(). - * 'cgroup' was just created automatically as a result of - * a cgroup_clone(), and the current task is about to - * be moved into 'cgroup'. + * post_clone() is called during cgroup_create() when the + * clone_children mount argument was specified. The cgroup + * can not yet have any tasks. 
* * Currently we refuse to set up the cgroup - thereby * refusing the task to be entered, and as a result refusing diff --git a/kernel/fork.c b/kernel/fork.c index 1fa9d940e30..1f84099ecce 100644 --- a/kernel/fork.c +++ b/kernel/fork.c @@ -1229,12 +1229,6 @@ static struct task_struct *copy_process(unsigned long clone_flags, if (clone_flags & CLONE_THREAD) p->tgid = current->tgid; - if (current->nsproxy != p->nsproxy) { - retval = ns_cgroup_clone(p, pid); - if (retval) - goto bad_fork_free_pid; - } - p->set_child_tid = (clone_flags & CLONE_CHILD_SETTID) ? child_tidptr : NULL; /* * Clear TID on mm_release()? diff --git a/kernel/ns_cgroup.c b/kernel/ns_cgroup.c deleted file mode 100644 index 2c98ad94ba0..00000000000 --- a/kernel/ns_cgroup.c +++ /dev/null @@ -1,118 +0,0 @@ -/* - * ns_cgroup.c - namespace cgroup subsystem - * - * Copyright 2006, 2007 IBM Corp - */ - -#include -#include -#include -#include -#include -#include - -struct ns_cgroup { - struct cgroup_subsys_state css; -}; - -struct cgroup_subsys ns_subsys; - -static inline struct ns_cgroup *cgroup_to_ns( - struct cgroup *cgroup) -{ - return container_of(cgroup_subsys_state(cgroup, ns_subsys_id), - struct ns_cgroup, css); -} - -int ns_cgroup_clone(struct task_struct *task, struct pid *pid) -{ - char name[PROC_NUMBUF]; - - snprintf(name, PROC_NUMBUF, "%d", pid_vnr(pid)); - return cgroup_clone(task, &ns_subsys, name); -} - -/* - * Rules: - * 1. you can only enter a cgroup which is a descendant of your current - * cgroup - * 2. you can only place another process into a cgroup if - * a. you have CAP_SYS_ADMIN - * b. your cgroup is an ancestor of task's destination cgroup - * (hence either you are in the same cgroup as task, or in an - * ancestor cgroup thereof) - */ -static int ns_can_attach(struct cgroup_subsys *ss, struct cgroup *new_cgroup, - struct task_struct *task, bool threadgroup) -{ - if (current != task) { - if (!capable(CAP_SYS_ADMIN)) - return -EPERM; - - if (!cgroup_is_descendant(new_cgroup, current)) - return -EPERM; - } - - if (!cgroup_is_descendant(new_cgroup, task)) - return -EPERM; - - if (threadgroup) { - struct task_struct *c; - rcu_read_lock(); - list_for_each_entry_rcu(c, &task->thread_group, thread_group) { - if (!cgroup_is_descendant(new_cgroup, c)) { - rcu_read_unlock(); - return -EPERM; - } - } - rcu_read_unlock(); - } - - return 0; -} - -/* - * Rules: you can only create a cgroup if - * 1. you are capable(CAP_SYS_ADMIN) - * 2. 
the target cgroup is a descendant of your own cgroup - */ -static struct cgroup_subsys_state *ns_create(struct cgroup_subsys *ss, - struct cgroup *cgroup) -{ - struct ns_cgroup *ns_cgroup; - - if (!capable(CAP_SYS_ADMIN)) - return ERR_PTR(-EPERM); - if (!cgroup_is_descendant(cgroup, current)) - return ERR_PTR(-EPERM); - if (test_bit(CGRP_CLONE_CHILDREN, &cgroup->flags)) { - printk("ns_cgroup can't be created with parent " - "'clone_children' set.\n"); - return ERR_PTR(-EINVAL); - } - - printk_once("ns_cgroup deprecated: consider using the " - "'clone_children' flag without the ns_cgroup.\n"); - - ns_cgroup = kzalloc(sizeof(*ns_cgroup), GFP_KERNEL); - if (!ns_cgroup) - return ERR_PTR(-ENOMEM); - return &ns_cgroup->css; -} - -static void ns_destroy(struct cgroup_subsys *ss, - struct cgroup *cgroup) -{ - struct ns_cgroup *ns_cgroup; - - ns_cgroup = cgroup_to_ns(cgroup); - kfree(ns_cgroup); -} - -struct cgroup_subsys ns_subsys = { - .name = "ns", - .can_attach = ns_can_attach, - .create = ns_create, - .destroy = ns_destroy, - .subsys_id = ns_subsys_id, -}; diff --git a/kernel/nsproxy.c b/kernel/nsproxy.c index 5424e37673e..d6a00f3de15 100644 --- a/kernel/nsproxy.c +++ b/kernel/nsproxy.c @@ -201,10 +201,6 @@ int unshare_nsproxy_namespaces(unsigned long unshare_flags, goto out; } - err = ns_cgroup_clone(current, task_pid(current)); - if (err) - put_nsproxy(*new_nsp); - out: return err; } -- cgit v1.2.3-70-g09d2 From 3864601387cf4196371e3c1897fdffa5228296f9 Mon Sep 17 00:00:00 2001 From: Jiri Slaby Date: Thu, 26 May 2011 16:25:46 -0700 Subject: mm: extract exe_file handling from procfs Setup and cleanup of mm_struct->exe_file is currently done in fs/proc/. This was because exe_file was needed only for /proc/<pid>/exe. Since we will need the exe_file functionality also for core dumps (so the core name can contain the full binary path), build this functionality into the kernel unconditionally. To achieve that, move it out of proc FS to kernel/, where in fact it should belong. By doing that we can make dup_mm_exe_file static. Also we can drop the linux/proc_fs.h inclusion in fs/exec.c and kernel/fork.c. Signed-off-by: Jiri Slaby Cc: Alexander Viro Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- fs/exec.c | 1 - fs/proc/base.c | 51 ----------------------------------------------- include/linux/mm.h | 10 ++-------- include/linux/mm_types.h | 2 -- include/linux/proc_fs.h | 19 ------------------ kernel/fork.c | 52 +++++++++++++++++++++++++++++++++++++++++++++++- 6 files changed, 53 insertions(+), 82 deletions(-) (limited to 'kernel') diff --git a/fs/exec.c b/fs/exec.c index 936f5776655..88a16c57228 100644 --- a/fs/exec.c +++ b/fs/exec.c @@ -42,7 +42,6 @@ #include #include #include -#include <linux/proc_fs.h> #include #include #include diff --git a/fs/proc/base.c b/fs/proc/base.c index dc8bca72b00..c2ac2fb123c 100644 --- a/fs/proc/base.c +++ b/fs/proc/base.c @@ -1576,57 +1576,6 @@ static const struct file_operations proc_pid_set_comm_operations = { .release = single_release, }; -/* - * We added or removed a vma mapping the executable. The vmas are only mapped - * during exec and are not mapped with the mmap system call.
- Callers must hold down_write() on the mm's mmap_sem for these - */ -void added_exe_file_vma(struct mm_struct *mm) -{ - mm->num_exe_file_vmas++; -} - -void removed_exe_file_vma(struct mm_struct *mm) -{ - mm->num_exe_file_vmas--; - if ((mm->num_exe_file_vmas == 0) && mm->exe_file){ - fput(mm->exe_file); - mm->exe_file = NULL; - } - -} - -void set_mm_exe_file(struct mm_struct *mm, struct file *new_exe_file) -{ - if (new_exe_file) - get_file(new_exe_file); - if (mm->exe_file) - fput(mm->exe_file); - mm->exe_file = new_exe_file; - mm->num_exe_file_vmas = 0; -} - -struct file *get_mm_exe_file(struct mm_struct *mm) -{ - struct file *exe_file; - - /* We need mmap_sem to protect against races with removal of - * VM_EXECUTABLE vmas */ - down_read(&mm->mmap_sem); - exe_file = mm->exe_file; - if (exe_file) - get_file(exe_file); - up_read(&mm->mmap_sem); - return exe_file; -} - -void dup_mm_exe_file(struct mm_struct *oldmm, struct mm_struct *newmm) -{ - /* It's safe to write the exe_file pointer without exe_file_lock because - * this is called during fork when the task is not yet in /proc */ - newmm->exe_file = get_mm_exe_file(oldmm); -} - static int proc_exe_link(struct inode *inode, struct path *exe_path) { struct task_struct *task; diff --git a/include/linux/mm.h b/include/linux/mm.h index fb8e814f78d..9670f71d7be 100644 --- a/include/linux/mm.h +++ b/include/linux/mm.h @@ -1408,17 +1408,11 @@ extern void exit_mmap(struct mm_struct *); extern int mm_take_all_locks(struct mm_struct *mm); extern void mm_drop_all_locks(struct mm_struct *mm); -#ifdef CONFIG_PROC_FS /* From fs/proc/base.c. callers must _not_ hold the mm's exe_file_lock */ extern void added_exe_file_vma(struct mm_struct *mm); extern void removed_exe_file_vma(struct mm_struct *mm); -#else -static inline void added_exe_file_vma(struct mm_struct *mm) -{} - -static inline void removed_exe_file_vma(struct mm_struct *mm) -{} -#endif /* CONFIG_PROC_FS */ +extern void set_mm_exe_file(struct mm_struct *mm, struct file *new_exe_file); +extern struct file *get_mm_exe_file(struct mm_struct *mm); extern int may_expand_vm(struct mm_struct *mm, unsigned long npages); extern int install_special_mapping(struct mm_struct *mm, diff --git a/include/linux/mm_types.h b/include/linux/mm_types.h index 6fe96c19f85..2a78aae78c6 100644 --- a/include/linux/mm_types.h +++ b/include/linux/mm_types.h @@ -302,11 +302,9 @@ struct mm_struct { struct task_struct __rcu *owner; #endif -#ifdef CONFIG_PROC_FS /* store ref to file /proc/<pid>/exe symlink points to */ struct file *exe_file; unsigned long num_exe_file_vmas; -#endif #ifdef CONFIG_MMU_NOTIFIER struct mmu_notifier_mm *mmu_notifier_mm; #endif diff --git a/include/linux/proc_fs.h b/include/linux/proc_fs.h index 648c9c58add..e7576cf9e32 100644 --- a/include/linux/proc_fs.h +++ b/include/linux/proc_fs.h @@ -173,12 +173,6 @@ extern void proc_net_remove(struct net *net, const char *name); extern struct proc_dir_entry *proc_net_mkdir(struct net *net, const char *name, struct proc_dir_entry *parent); -/* While the {get|set|dup}_mm_exe_file functions are for mm_structs, they are - * only needed to implement /proc/<pid>|self/exe so we define them here.
*/ -extern void set_mm_exe_file(struct mm_struct *mm, struct file *new_exe_file); -extern struct file *get_mm_exe_file(struct mm_struct *mm); -extern void dup_mm_exe_file(struct mm_struct *oldmm, struct mm_struct *newmm); - extern struct file *proc_ns_fget(int fd); #else @@ -230,19 +224,6 @@ static inline void pid_ns_release_proc(struct pid_namespace *ns) { } -static inline void set_mm_exe_file(struct mm_struct *mm, - struct file *new_exe_file) -{} - -static inline struct file *get_mm_exe_file(struct mm_struct *mm) -{ - return NULL; -} - -static inline void dup_mm_exe_file(struct mm_struct *oldmm, - struct mm_struct *newmm) -{} - static inline struct file *proc_ns_fget(int fd) { return ERR_PTR(-EINVAL); diff --git a/kernel/fork.c b/kernel/fork.c index 1f84099ecce..ca406d91671 100644 --- a/kernel/fork.c +++ b/kernel/fork.c @@ -59,7 +59,6 @@ #include #include #include -#include #include #include #include @@ -597,6 +596,57 @@ void mmput(struct mm_struct *mm) } EXPORT_SYMBOL_GPL(mmput); +/* + * We added or removed a vma mapping the executable. The vmas are only mapped + * during exec and are not mapped with the mmap system call. + * Callers must hold down_write() on the mm's mmap_sem for these + */ +void added_exe_file_vma(struct mm_struct *mm) +{ + mm->num_exe_file_vmas++; +} + +void removed_exe_file_vma(struct mm_struct *mm) +{ + mm->num_exe_file_vmas--; + if ((mm->num_exe_file_vmas == 0) && mm->exe_file){ + fput(mm->exe_file); + mm->exe_file = NULL; + } + +} + +void set_mm_exe_file(struct mm_struct *mm, struct file *new_exe_file) +{ + if (new_exe_file) + get_file(new_exe_file); + if (mm->exe_file) + fput(mm->exe_file); + mm->exe_file = new_exe_file; + mm->num_exe_file_vmas = 0; +} + +struct file *get_mm_exe_file(struct mm_struct *mm) +{ + struct file *exe_file; + + /* We need mmap_sem to protect against races with removal of + * VM_EXECUTABLE vmas */ + down_read(&mm->mmap_sem); + exe_file = mm->exe_file; + if (exe_file) + get_file(exe_file); + up_read(&mm->mmap_sem); + return exe_file; +} + +static void dup_mm_exe_file(struct mm_struct *oldmm, struct mm_struct *newmm) +{ + /* It's safe to write the exe_file pointer without exe_file_lock because + * this is called during fork when the task is not yet in /proc */ + newmm->exe_file = get_mm_exe_file(oldmm); +} + /** * get_task_mm - acquire a reference to the task's mm * -- cgit v1.2.3-70-g09d2 From 6f7bd76f05eb2bfbb48d58c0408a50a7e16b2423 Mon Sep 17 00:00:00 2001 From: Rakib Mullick Date: Thu, 26 May 2011 16:26:00 -0700 Subject: kernel/profile.c: remove some duplicate code from profile_hits() profile_hits() has a common check for prof_on and prof_buffer regardless of SMP or !SMP. So, remove some duplicate code by splitting profile_hits into two. 
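In outline, the refactor hoists the guard that the SMP and !SMP variants both duplicated into a single exported wrapper; reassembled from the hunks below, the resulting code has this shape:

	void profile_hits(int type, void *__pc, unsigned int nr_hits)
	{
		/* common early-out, formerly duplicated in both variants */
		if (prof_on != type || !prof_buffer)
			return;
		/* dispatch to the SMP or !SMP implementation */
		do_profile_hits(type, __pc, nr_hits);
	}
	EXPORT_SYMBOL_GPL(profile_hits);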
[akpm@linux-foundation.org: make do_profile_hits static] Signed-off-by: Rakib Mullick Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- kernel/profile.c | 16 +++++++++------- 1 file changed, 9 insertions(+), 7 deletions(-) (limited to 'kernel') diff --git a/kernel/profile.c b/kernel/profile.c index 14c9f87b9fc..961b389fe52 100644 --- a/kernel/profile.c +++ b/kernel/profile.c @@ -303,14 +303,12 @@ static void profile_discard_flip_buffers(void) mutex_unlock(&profile_flip_mutex); } -void profile_hits(int type, void *__pc, unsigned int nr_hits) +static void do_profile_hits(int type, void *__pc, unsigned int nr_hits) { unsigned long primary, secondary, flags, pc = (unsigned long)__pc; int i, j, cpu; struct profile_hit *hits; - if (prof_on != type || !prof_buffer) - return; pc = min((pc - (unsigned long)_stext) >> prof_shift, prof_len - 1); i = primary = (pc & (NR_PROFILE_GRP - 1)) << PROFILE_GRPSHIFT; secondary = (~(pc << 1) & (NR_PROFILE_GRP - 1)) << PROFILE_GRPSHIFT; @@ -417,16 +415,20 @@ out_free: #define profile_discard_flip_buffers() do { } while (0) #define profile_cpu_callback NULL -void profile_hits(int type, void *__pc, unsigned int nr_hits) +static void do_profile_hits(int type, void *__pc, unsigned int nr_hits) { unsigned long pc; - - if (prof_on != type || !prof_buffer) - return; pc = ((unsigned long)__pc - (unsigned long)_stext) >> prof_shift; atomic_add(nr_hits, &prof_buffer[min(pc, prof_len - 1)]); } #endif /* !CONFIG_SMP */ + +void profile_hits(int type, void *__pc, unsigned int nr_hits) +{ + if (prof_on != type || !prof_buffer) + return; + do_profile_hits(type, __pc, nr_hits); +} EXPORT_SYMBOL_GPL(profile_hits); void profile_tick(int type) -- cgit v1.2.3-70-g09d2 From cd4ae6adf8b1c21d88e83ed56afeeef97b28f356 Mon Sep 17 00:00:00 2001 From: Xiaotian Feng Date: Fri, 22 Apr 2011 18:53:54 +0800 Subject: sched: More sched_domain iterations fixes sched_domain iterations need to be protected by rcu_read_lock() now; this patch adds another two places which need the rcu lock, as spotted by the following suspicious rcu_dereference_check() usage warnings. kernel/sched_rt.c:1244 invoked rcu_dereference_check() without protection! kernel/sched_stats.h:41 invoked rcu_dereference_check() without protection! Signed-off-by: Xiaotian Feng Signed-off-by: Peter Zijlstra Link: http://lkml.kernel.org/r/1303469634-11678-1-git-send-email-dfeng@redhat.com Signed-off-by: Ingo Molnar --- kernel/sched_rt.c | 10 ++++++++-- kernel/sched_stats.h | 4 ++-- 2 files changed, 10 insertions(+), 4 deletions(-) (limited to 'kernel') diff --git a/kernel/sched_rt.c b/kernel/sched_rt.c index 64b2a37c07d..88725c939e0 100644 --- a/kernel/sched_rt.c +++ b/kernel/sched_rt.c @@ -1263,6 +1263,7 @@ static int find_lowest_rq(struct task_struct *task) if (!cpumask_test_cpu(this_cpu, lowest_mask)) this_cpu = -1; /* Skip this_cpu opt if not among lowest */ + rcu_read_lock(); for_each_domain(cpu, sd) { if (sd->flags & SD_WAKE_AFFINE) { int best_cpu; @@ -1272,15 +1273,20 @@ static int find_lowest_rq(struct task_struct *task) * remote processor.
*/ if (this_cpu != -1 && - cpumask_test_cpu(this_cpu, sched_domain_span(sd))) + cpumask_test_cpu(this_cpu, sched_domain_span(sd))) { + rcu_read_unlock(); return this_cpu; + } best_cpu = cpumask_first_and(lowest_mask, sched_domain_span(sd)); - if (best_cpu < nr_cpu_ids) + if (best_cpu < nr_cpu_ids) { + rcu_read_unlock(); return best_cpu; + } } } + rcu_read_unlock(); /* * And finally, if there were no matches within the domains diff --git a/kernel/sched_stats.h b/kernel/sched_stats.h index 48ddf431db0..331e01bcd02 100644 --- a/kernel/sched_stats.h +++ b/kernel/sched_stats.h @@ -37,7 +37,7 @@ static int show_schedstat(struct seq_file *seq, void *v) #ifdef CONFIG_SMP /* domain-specific stats */ - preempt_disable(); + rcu_read_lock(); for_each_domain(cpu, sd) { enum cpu_idle_type itype; @@ -64,7 +64,7 @@ static int show_schedstat(struct seq_file *seq, void *v) sd->ttwu_wake_remote, sd->ttwu_move_affine, sd->ttwu_move_balance); } - preempt_enable(); + rcu_read_unlock(); #endif } kfree(mask_str); -- cgit v1.2.3-70-g09d2 From d6aa8f85f16379d42c147b22b59e33b67f9ff466 Mon Sep 17 00:00:00 2001 From: Peter Zijlstra Date: Thu, 26 May 2011 14:21:33 +0200 Subject: sched: Fix ttwu() for __ARCH_WANT_INTERRUPTS_ON_CTXSW Marc reported that e4a52bcb9 (sched: Remove rq->lock from the first half of ttwu()) broke his ARM-SMP machine. Now ARM is one of the few __ARCH_WANT_INTERRUPTS_ON_CTXSW users, so that exception in the ttwu() code was suspect. Yong found that the interrupt could hit after context_switch() changes current but before it clears p->on_cpu; if that interrupt were to attempt a wake-up of p, we would indeed find ourselves spinning in IRQ context. Fix this by reverting to the old behaviour for this situation and performing a full remote wake-up. Cc: Frank Rowand Cc: Yong Zhang Cc: Oleg Nesterov Reported-by: Marc Zyngier Tested-by: Marc Zyngier Signed-off-by: Peter Zijlstra Signed-off-by: Ingo Molnar --- kernel/sched.c | 37 ++++++++++++++++++++++++++++--------- 1 file changed, 28 insertions(+), 9 deletions(-) (limited to 'kernel') diff --git a/kernel/sched.c b/kernel/sched.c index 5e43e9dc65d..a80ee911900 100644 --- a/kernel/sched.c +++ b/kernel/sched.c @@ -2573,7 +2573,26 @@ static void ttwu_queue_remote(struct task_struct *p, int cpu) if (!next) smp_send_reschedule(cpu); } -#endif + +#ifdef __ARCH_WANT_INTERRUPTS_ON_CTXSW +static int ttwu_activate_remote(struct task_struct *p, int wake_flags) +{ + struct rq *rq; + int ret = 0; + + rq = __task_rq_lock(p); + if (p->on_cpu) { + ttwu_activate(rq, p, ENQUEUE_WAKEUP); + ttwu_do_wakeup(rq, p, wake_flags); + ret = 1; + } + __task_rq_unlock(rq); + + return ret; + +} +#endif /* __ARCH_WANT_INTERRUPTS_ON_CTXSW */ +#endif /* CONFIG_SMP */ static void ttwu_queue(struct task_struct *p, int cpu) { @@ -2631,17 +2650,17 @@ try_to_wake_up(struct task_struct *p, unsigned int state, int wake_flags) while (p->on_cpu) { #ifdef __ARCH_WANT_INTERRUPTS_ON_CTXSW /* - * If called from interrupt context we could have landed in the - * middle of schedule(), in this case we should take care not - * to spin on ->on_cpu if p is current, since that would - * deadlock. + * In case the architecture enables interrupts in + * context_switch(), we cannot busy wait, since that + * would lead to deadlocks when an interrupt hits and + * tries to wake up @prev. So bail and do a complete + * remote wakeup.
*/ - if (p == current) { - ttwu_queue(p, cpu); + if (ttwu_activate_remote(p, wake_flags)) goto stat; - } -#endif +#else cpu_relax(); +#endif } /* * Pairs with the smp_wmb() in finish_lock_switch(). -- cgit v1.2.3-70-g09d2 From 1e876231785d82443a5ac8b6c660e9f51bc5dede Mon Sep 17 00:00:00 2001 From: Peter Zijlstra Date: Tue, 17 May 2011 16:21:10 -0700 Subject: sched: Fix ->min_vruntime calculation in dequeue_entity() Dima Zavin reported: "After pulling the thread off the run-queue during a cgroup change, the cfs_rq.min_vruntime gets recalculated. The dequeued thread's vruntime then gets normalized to this new value. This can then lead to the thread getting an unfair boost in the new group if the vruntime of the next task in the old run-queue was way further ahead." Reported-by: Dima Zavin Signed-off-by: John Stultz Recalls-having-tested-once-upon-a-time-by: Mike Galbraith Signed-off-by: Peter Zijlstra Link: http://lkml.kernel.org/r/1305674470-23727-1-git-send-email-john.stultz@linaro.org Signed-off-by: Ingo Molnar --- kernel/sched_fair.c | 5 +++-- 1 file changed, 3 insertions(+), 2 deletions(-) (limited to 'kernel') diff --git a/kernel/sched_fair.c b/kernel/sched_fair.c index e32a9b70ee9..433491c2dc8 100644 --- a/kernel/sched_fair.c +++ b/kernel/sched_fair.c @@ -1076,8 +1076,6 @@ dequeue_entity(struct cfs_rq *cfs_rq, struct sched_entity *se, int flags) se->on_rq = 0; update_cfs_load(cfs_rq, 0); account_entity_dequeue(cfs_rq, se); - update_min_vruntime(cfs_rq); - update_cfs_shares(cfs_rq); /* * Normalize the entity after updating the min_vruntime because the @@ -1086,6 +1084,9 @@ dequeue_entity(struct cfs_rq *cfs_rq, struct sched_entity *se, int flags) */ if (!(flags & DEQUEUE_SLEEP)) se->vruntime -= cfs_rq->min_vruntime; + + update_min_vruntime(cfs_rq); + update_cfs_shares(cfs_rq); } /* -- cgit v1.2.3-70-g09d2 From 1e1b6c511d1b23cb7c3b619d82fc7bd9f620565d Mon Sep 17 00:00:00 2001 From: KOSAKI Motohiro Date: Thu, 19 May 2011 15:08:58 +0900 Subject: cpuset: Fix cpuset_cpus_allowed_fallback(), don't update tsk->rt.nr_cpus_allowed The rule is, we have to update tsk->rt.nr_cpus_allowed if we change tsk->cpus_allowed. Otherwise the RT scheduler may get confused.
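The fix routes every mask update through one helper so the two fields cannot drift apart; reassembled from the kernel/sched.c hunk below, the helper looks like this:

	void do_set_cpus_allowed(struct task_struct *p, const struct cpumask *new_mask)
	{
		if (p->sched_class && p->sched_class->set_cpus_allowed)
			p->sched_class->set_cpus_allowed(p, new_mask);
		else {
			cpumask_copy(&p->cpus_allowed, new_mask);
			/* the rule stated above: keep nr_cpus_allowed in sync */
			p->rt.nr_cpus_allowed = cpumask_weight(new_mask);
		}
	}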
Signed-off-by: KOSAKI Motohiro Cc: Oleg Nesterov Signed-off-by: Peter Zijlstra Link: http://lkml.kernel.org/r/4DD4B3FA.5060901@jp.fujitsu.com Signed-off-by: Ingo Molnar --- include/linux/cpuset.h | 2 +- include/linux/sched.h | 7 +++++++ kernel/cpuset.c | 4 ++-- kernel/kthread.c | 4 ++-- kernel/sched.c | 19 ++++++++++++------- 5 files changed, 24 insertions(+), 12 deletions(-) (limited to 'kernel') diff --git a/include/linux/cpuset.h b/include/linux/cpuset.h index f20eb8f1602..e9eaec52265 100644 --- a/include/linux/cpuset.h +++ b/include/linux/cpuset.h @@ -146,7 +146,7 @@ static inline void cpuset_cpus_allowed(struct task_struct *p, static inline int cpuset_cpus_allowed_fallback(struct task_struct *p) { - cpumask_copy(&p->cpus_allowed, cpu_possible_mask); + do_set_cpus_allowed(p, cpu_possible_mask); return cpumask_any(cpu_active_mask); } diff --git a/include/linux/sched.h b/include/linux/sched.h index dc8871295a5..8da84b7bc1b 100644 --- a/include/linux/sched.h +++ b/include/linux/sched.h @@ -1841,9 +1841,16 @@ static inline void rcu_copy_process(struct task_struct *p) #endif #ifdef CONFIG_SMP +extern void do_set_cpus_allowed(struct task_struct *p, + const struct cpumask *new_mask); + extern int set_cpus_allowed_ptr(struct task_struct *p, const struct cpumask *new_mask); #else +static inline void do_set_cpus_allowed(struct task_struct *p, + const struct cpumask *new_mask) +{ +} static inline int set_cpus_allowed_ptr(struct task_struct *p, const struct cpumask *new_mask) { diff --git a/kernel/cpuset.c b/kernel/cpuset.c index 1ceeb049c82..9c9b7545c81 100644 --- a/kernel/cpuset.c +++ b/kernel/cpuset.c @@ -2190,7 +2190,7 @@ int cpuset_cpus_allowed_fallback(struct task_struct *tsk) rcu_read_lock(); cs = task_cs(tsk); if (cs) - cpumask_copy(&tsk->cpus_allowed, cs->cpus_allowed); + do_set_cpus_allowed(tsk, cs->cpus_allowed); rcu_read_unlock(); /* @@ -2217,7 +2217,7 @@ int cpuset_cpus_allowed_fallback(struct task_struct *tsk) * Like above we can temporary set any mask and rely on * set_cpus_allowed_ptr() as synchronization point. */ - cpumask_copy(&tsk->cpus_allowed, cpu_possible_mask); + do_set_cpus_allowed(tsk, cpu_possible_mask); cpu = cpumask_any(cpu_active_mask); } diff --git a/kernel/kthread.c b/kernel/kthread.c index 3b34d2732bc..4ba7cccb499 100644 --- a/kernel/kthread.c +++ b/kernel/kthread.c @@ -202,8 +202,8 @@ void kthread_bind(struct task_struct *p, unsigned int cpu) return; } - p->cpus_allowed = cpumask_of_cpu(cpu); - p->rt.nr_cpus_allowed = 1; + /* It's safe because the task is inactive. 
*/ + do_set_cpus_allowed(p, cpumask_of(cpu)); p->flags |= PF_THREAD_BOUND; } EXPORT_SYMBOL(kthread_bind); diff --git a/kernel/sched.c b/kernel/sched.c index a80ee911900..cbb3a0eee58 100644 --- a/kernel/sched.c +++ b/kernel/sched.c @@ -5860,7 +5860,7 @@ void __cpuinit init_idle(struct task_struct *idle, int cpu) idle->state = TASK_RUNNING; idle->se.exec_start = sched_clock(); - cpumask_copy(&idle->cpus_allowed, cpumask_of(cpu)); + do_set_cpus_allowed(idle, cpumask_of(cpu)); /* * We're having a chicken and egg problem, even though we are * holding rq->lock, the cpu isn't yet set to this cpu so the @@ -5948,6 +5948,16 @@ static inline void sched_init_granularity(void) } #ifdef CONFIG_SMP +void do_set_cpus_allowed(struct task_struct *p, const struct cpumask *new_mask) +{ + if (p->sched_class && p->sched_class->set_cpus_allowed) + p->sched_class->set_cpus_allowed(p, new_mask); + else { + cpumask_copy(&p->cpus_allowed, new_mask); + p->rt.nr_cpus_allowed = cpumask_weight(new_mask); + } +} + /* * This is how migration works: * @@ -5993,12 +6003,7 @@ int set_cpus_allowed_ptr(struct task_struct *p, const struct cpumask *new_mask) goto out; } - if (p->sched_class->set_cpus_allowed) - p->sched_class->set_cpus_allowed(p, new_mask); - else { - cpumask_copy(&p->cpus_allowed, new_mask); - p->rt.nr_cpus_allowed = cpumask_weight(new_mask); - } + do_set_cpus_allowed(p, new_mask); /* Can the task run on the task's current CPU? If so, we're done */ if (cpumask_test_cpu(task_cpu(p), new_mask)) -- cgit v1.2.3-70-g09d2 From f506b3dc0ec454a16d40cab9ee5d75435b39dc50 Mon Sep 17 00:00:00 2001 From: Peter Zijlstra Date: Thu, 26 May 2011 17:02:53 +0200 Subject: perf: Fix SIGIO handling Vince noticed that unless we mmap() a buffer, SIGIO gets lost. So explicitly push the wakeup (including signals) when requested. Reported-by: Vince Weaver Signed-off-by: Peter Zijlstra Cc: Link: http://lkml.kernel.org/n/tip-2euus3f3x3dyvdk52cjxw8zu@git.kernel.org Signed-off-by: Ingo Molnar --- kernel/events/core.c | 8 ++++++++ 1 file changed, 8 insertions(+) (limited to 'kernel') diff --git a/kernel/events/core.c b/kernel/events/core.c index c09767f7db3..d863b3c057b 100644 --- a/kernel/events/core.c +++ b/kernel/events/core.c @@ -5028,6 +5028,14 @@ static int __perf_event_overflow(struct perf_event *event, int nmi, else perf_event_output(event, nmi, data, regs); + if (event->fasync && event->pending_kill) { + if (nmi) { + event->pending_wakeup = 1; + irq_work_queue(&event->pending); + } else + perf_event_wakeup(event); + } + return ret; } -- cgit v1.2.3-70-g09d2 From 8826f3b0397562eee6f8785d548be9dfdb169100 Mon Sep 17 00:00:00 2001 From: "Paul E. McKenney" Date: Wed, 11 May 2011 05:41:41 -0700 Subject: rcu: Avoid acquiring rcu_node locks in timer functions This commit switches manipulations of the rcu_node ->wakemask field to atomic operations, which allows rcu_cpu_kthread_timer() to avoid acquiring the rcu_node lock. 
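In miniature, the conversion trades a lock-protected read-modify-write of the mask for an atomic one. The sketch below shows only the general pattern, assuming an atomic_or() primitive and using field names from the commit message; the patch's actual hunks are not part of this excerpt:

	/* before: timer context had to take the rcu_node lock to update the mask */
	raw_spin_lock_irqsave(&rnp->lock, flags);
	rnp->wakemask |= rdp->grpmask;
	raw_spin_unlock_irqrestore(&rnp->lock, flags);

	/* after: with ->wakemask an atomic_t, a lock-free atomic RMW suffices */
	atomic_or(rdp->grpmask, &rnp->wakemask);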
This should avoid the following lockdep splat reported by Valdis Kletnieks: [ 12.872150] usb 1-4: new high speed USB device number 3 using ehci_hcd [ 12.986667] usb 1-4: New USB device found, idVendor=413c, idProduct=2513 [ 12.986679] usb 1-4: New USB device strings: Mfr=0, Product=0, SerialNumber=0 [ 12.987691] hub 1-4:1.0: USB hub found [ 12.987877] hub 1-4:1.0: 3 ports detected [ 12.996372] input: PS/2 Generic Mouse as /devices/platform/i8042/serio1/input/input10 [ 13.071471] udevadm used greatest stack depth: 3984 bytes left [ 13.172129] [ 13.172130] ======================================================= [ 13.172425] [ INFO: possible circular locking dependency detected ] [ 13.172650] 2.6.39-rc6-mmotm0506 #1 [ 13.172773] ------------------------------------------------------- [ 13.172997] blkid/267 is trying to acquire lock: [ 13.173009] (&p->pi_lock){-.-.-.}, at: [] try_to_wake_up+0x29/0x1aa [ 13.173009] [ 13.173009] but task is already holding lock: [ 13.173009] (rcu_node_level_0){..-...}, at: [] rcu_cpu_kthread_timer+0x27/0x58 [ 13.173009] [ 13.173009] which lock already depends on the new lock. [ 13.173009] [ 13.173009] [ 13.173009] the existing dependency chain (in reverse order) is: [ 13.173009] [ 13.173009] -> #2 (rcu_node_level_0){..-...}: [ 13.173009] [] check_prevs_add+0x8b/0x104 [ 13.173009] [] validate_chain+0x36f/0x3ab [ 13.173009] [] __lock_acquire+0x369/0x3e2 [ 13.173009] [] lock_acquire+0xfc/0x14c [ 13.173009] [] _raw_spin_lock+0x36/0x45 [ 13.173009] [] rcu_read_unlock_special+0x8c/0x1d5 [ 13.173009] [] __rcu_read_unlock+0x4f/0xd7 [ 13.173009] [] rcu_read_unlock+0x21/0x23 [ 13.173009] [] cpuacct_charge+0x6c/0x75 [ 13.173009] [] update_curr+0x101/0x12e [ 13.173009] [] check_preempt_wakeup+0xf7/0x23b [ 13.173009] [] check_preempt_curr+0x2b/0x68 [ 13.173009] [] ttwu_do_wakeup+0x76/0x128 [ 13.173009] [] ttwu_do_activate.constprop.63+0x57/0x5c [ 13.173009] [] scheduler_ipi+0x48/0x5d [ 13.173009] [] smp_reschedule_interrupt+0x16/0x18 [ 13.173009] [] reschedule_interrupt+0x13/0x20 [ 13.173009] [] rcu_read_unlock+0x21/0x23 [ 13.173009] [] find_get_page+0xa9/0xb9 [ 13.173009] [] filemap_fault+0x6a/0x34d [ 13.173009] [] __do_fault+0x54/0x3e6 [ 13.173009] [] handle_pte_fault+0x12c/0x1ed [ 13.173009] [] handle_mm_fault+0x1cd/0x1e0 [ 13.173009] [] do_page_fault+0x42d/0x5de [ 13.173009] [] page_fault+0x1f/0x30 [ 13.173009] [ 13.173009] -> #1 (&rq->lock){-.-.-.}: [ 13.173009] [] check_prevs_add+0x8b/0x104 [ 13.173009] [] validate_chain+0x36f/0x3ab [ 13.173009] [] __lock_acquire+0x369/0x3e2 [ 13.173009] [] lock_acquire+0xfc/0x14c [ 13.173009] [] _raw_spin_lock+0x36/0x45 [ 13.173009] [] __task_rq_lock+0x8b/0xd3 [ 13.173009] [] wake_up_new_task+0x41/0x108 [ 13.173009] [] do_fork+0x265/0x33f [ 13.173009] [] kernel_thread+0x6b/0x6d [ 13.173009] [] rest_init+0x21/0xd2 [ 13.173009] [] start_kernel+0x3bb/0x3c6 [ 13.173009] [] x86_64_start_reservations+0xaf/0xb3 [ 13.173009] [] x86_64_start_kernel+0xf0/0xf7 [ 13.173009] [ 13.173009] -> #0 (&p->pi_lock){-.-.-.}: [ 13.173009] [] check_prev_add+0x68/0x20e [ 13.173009] [] check_prevs_add+0x8b/0x104 [ 13.173009] [] validate_chain+0x36f/0x3ab [ 13.173009] [] __lock_acquire+0x369/0x3e2 [ 13.173009] [] lock_acquire+0xfc/0x14c [ 13.173009] [] _raw_spin_lock_irqsave+0x44/0x57 [ 13.173009] [] try_to_wake_up+0x29/0x1aa [ 13.173009] [] wake_up_process+0x10/0x12 [ 13.173009] [] rcu_cpu_kthread_timer+0x44/0x58 [ 13.173009] [] call_timer_fn+0xac/0x1e9 [ 13.173009] [] run_timer_softirq+0x1aa/0x1f2 [ 13.173009] [] __do_softirq+0x109/0x26a [ 13.173009] [] 
call_softirq+0x1c/0x30 [ 13.173009] [] do_softirq+0x44/0xf1 [ 13.173009] [] irq_exit+0x58/0xc8 [ 13.173009] [] smp_apic_timer_interrupt+0x79/0x87 [ 13.173009] [] apic_timer_interrupt+0x13/0x20 [ 13.173009] [] get_page_from_freelist+0x2aa/0x310 [ 13.173009] [] __alloc_pages_nodemask+0x178/0x243 [ 13.173009] [] pte_alloc_one+0x1e/0x3a [ 13.173009] [] __pte_alloc+0x22/0x14b [ 13.173009] [] handle_mm_fault+0x17e/0x1e0 [ 13.173009] [] do_page_fault+0x42d/0x5de [ 13.173009] [] page_fault+0x1f/0x30 [ 13.173009] [ 13.173009] other info that might help us debug this: [ 13.173009] [ 13.173009] Chain exists of: [ 13.173009] &p->pi_lock --> &rq->lock --> rcu_node_level_0 [ 13.173009] [ 13.173009] Possible unsafe locking scenario: [ 13.173009] [ 13.173009] CPU0 CPU1 [ 13.173009] ---- ---- [ 13.173009] lock(rcu_node_level_0); [ 13.173009] lock(&rq->lock); [ 13.173009] lock(rcu_node_level_0); [ 13.173009] lock(&p->pi_lock); [ 13.173009] [ 13.173009] *** DEADLOCK *** [ 13.173009] [ 13.173009] 3 locks held by blkid/267: [ 13.173009] #0: (&mm->mmap_sem){++++++}, at: [] do_page_fault+0x1f3/0x5de [ 13.173009] #1: (&yield_timer){+.-...}, at: [] call_timer_fn+0x0/0x1e9 [ 13.173009] #2: (rcu_node_level_0){..-...}, at: [] rcu_cpu_kthread_timer+0x27/0x58 [ 13.173009] [ 13.173009] stack backtrace: [ 13.173009] Pid: 267, comm: blkid Not tainted 2.6.39-rc6-mmotm0506 #1 [ 13.173009] Call Trace: [ 13.173009] [] print_circular_bug+0xc8/0xd9 [ 13.173009] [] check_prev_add+0x68/0x20e [ 13.173009] [] ? save_stack_trace+0x28/0x46 [ 13.173009] [] check_prevs_add+0x8b/0x104 [ 13.173009] [] validate_chain+0x36f/0x3ab [ 13.173009] [] __lock_acquire+0x369/0x3e2 [ 13.173009] [] ? try_to_wake_up+0x29/0x1aa [ 13.173009] [] lock_acquire+0xfc/0x14c [ 13.173009] [] ? try_to_wake_up+0x29/0x1aa [ 13.173009] [] ? rcu_check_quiescent_state+0x82/0x82 [ 13.173009] [] _raw_spin_lock_irqsave+0x44/0x57 [ 13.173009] [] ? try_to_wake_up+0x29/0x1aa [ 13.173009] [] try_to_wake_up+0x29/0x1aa [ 13.173009] [] ? rcu_check_quiescent_state+0x82/0x82 [ 13.173009] [] wake_up_process+0x10/0x12 [ 13.173009] [] rcu_cpu_kthread_timer+0x44/0x58 [ 13.173009] [] ? rcu_check_quiescent_state+0x82/0x82 [ 13.173009] [] call_timer_fn+0xac/0x1e9 [ 13.173009] [] ? del_timer+0x75/0x75 [ 13.173009] [] ? rcu_check_quiescent_state+0x82/0x82 [ 13.173009] [] run_timer_softirq+0x1aa/0x1f2 [ 13.173009] [] __do_softirq+0x109/0x26a [ 13.173009] [] ? tick_dev_program_event+0x37/0xf6 [ 13.173009] [] ? time_hardirqs_off+0x1b/0x2f [ 13.173009] [] call_softirq+0x1c/0x30 [ 13.173009] [] do_softirq+0x44/0xf1 [ 13.173009] [] irq_exit+0x58/0xc8 [ 13.173009] [] smp_apic_timer_interrupt+0x79/0x87 [ 13.173009] [] apic_timer_interrupt+0x13/0x20 [ 13.173009] [] ? get_page_from_freelist+0x114/0x310 [ 13.173009] [] ? get_page_from_freelist+0x2aa/0x310 [ 13.173009] [] ? clear_page_c+0x7/0x10 [ 13.173009] [] ? prep_new_page+0x14c/0x1cd [ 13.173009] [] get_page_from_freelist+0x2aa/0x310 [ 13.173009] [] __alloc_pages_nodemask+0x178/0x243 [ 13.173009] [] ? __pmd_alloc+0x87/0x99 [ 13.173009] [] pte_alloc_one+0x1e/0x3a [ 13.173009] [] ? __pmd_alloc+0x87/0x99 [ 13.173009] [] __pte_alloc+0x22/0x14b [ 13.173009] [] handle_mm_fault+0x17e/0x1e0 [ 13.173009] [] do_page_fault+0x42d/0x5de [ 13.173009] [] ? sys_brk+0x32/0x10c [ 13.173009] [] ? time_hardirqs_off+0x1b/0x2f [ 13.173009] [] ? trace_hardirqs_off_caller+0x3f/0x9c [ 13.173009] [] ? 
trace_hardirqs_off_thunk+0x3a/0x3c [ 13.173009] [] page_fault+0x1f/0x30 [ 14.010075] usb 5-1: new full speed USB device number 2 using uhci_hcd Reported-by: Valdis Kletnieks Signed-off-by: Paul E. McKenney Signed-off-by: Paul E. McKenney Signed-off-by: Ingo Molnar --- kernel/rcutree.c | 13 +++++-------- kernel/rcutree.h | 4 +++- 2 files changed, 8 insertions(+), 9 deletions(-) (limited to 'kernel') diff --git a/kernel/rcutree.c b/kernel/rcutree.c index 8154a4a3491..5d96d68d20f 100644 --- a/kernel/rcutree.c +++ b/kernel/rcutree.c @@ -36,7 +36,7 @@ #include #include #include -#include +#include #include #include #include @@ -1526,13 +1526,10 @@ static void rcu_cpu_kthread_setrt(int cpu, int to_rt) */ static void rcu_cpu_kthread_timer(unsigned long arg) { - unsigned long flags; struct rcu_data *rdp = per_cpu_ptr(rcu_state->rda, arg); struct rcu_node *rnp = rdp->mynode; - raw_spin_lock_irqsave(&rnp->lock, flags); - rnp->wakemask |= rdp->grpmask; - raw_spin_unlock_irqrestore(&rnp->lock, flags); + atomic_or(rdp->grpmask, &rnp->wakemask); invoke_rcu_node_kthread(rnp); } @@ -1680,11 +1677,11 @@ static int rcu_node_kthread(void *arg) for (;;) { rnp->node_kthread_status = RCU_KTHREAD_WAITING; - wait_event_interruptible(rnp->node_wq, rnp->wakemask != 0); + wait_event_interruptible(rnp->node_wq, + atomic_read(&rnp->wakemask) != 0); rnp->node_kthread_status = RCU_KTHREAD_RUNNING; raw_spin_lock_irqsave(&rnp->lock, flags); - mask = rnp->wakemask; - rnp->wakemask = 0; + mask = atomic_xchg(&rnp->wakemask, 0); rcu_initiate_boost(rnp, flags); /* releases rnp->lock. */ for (cpu = rnp->grplo; cpu <= rnp->grphi; cpu++, mask >>= 1) { if ((mask & 0x1) == 0) diff --git a/kernel/rcutree.h b/kernel/rcutree.h index 93d4a1c2e88..561dcb9a8d2 100644 --- a/kernel/rcutree.h +++ b/kernel/rcutree.h @@ -119,7 +119,9 @@ struct rcu_node { /* elements that need to drain to allow the */ /* current expedited grace period to */ /* complete (only for TREE_PREEMPT_RCU). */ - unsigned long wakemask; /* CPUs whose kthread needs to be awakened. */ + atomic_t wakemask; /* CPUs whose kthread needs to be awakened. */ + /* Since this has meaning only for leaf */ + /* rcu_node structures, 32 bits suffices. */ unsigned long qsmaskinit; /* Per-GP initial value for qsmask & expmask. */ unsigned long grpmask; /* Mask to apply to parent qsmask. */ -- cgit v1.2.3-70-g09d2 From 08bca60a6912ad225254250c0a9c3a05b4152cfa Mon Sep 17 00:00:00 2001 From: Peter Zijlstra Date: Fri, 20 May 2011 16:06:29 -0700 Subject: rcu: Remove waitqueue usage for cpu, node, and boost kthreads It is not necessary to use waitqueues for the RCU kthreads because we always know exactly which thread is to be awakened. In addition, wake_up() only issues an actual wakeup when there is a thread waiting on the queue, which was why there was an extra explicit wake_up_process() to get the RCU kthreads started. Eliminating the waitqueues (and wake_up()) in favor of wake_up_process() eliminates the need for the initial wake_up_process() and also shrinks the data structure size a bit. The wakeup logic is placed in a new rcu_wait() macro. Signed-off-by: Peter Zijlstra Signed-off-by: Paul E. 
McKenney Signed-off-by: Ingo Molnar --- kernel/rcutree.c | 23 ++++++----------------- kernel/rcutree.h | 17 ++++++++++------- kernel/rcutree_plugin.h | 16 +--------------- 3 files changed, 17 insertions(+), 39 deletions(-) (limited to 'kernel') diff --git a/kernel/rcutree.c b/kernel/rcutree.c index 5d96d68d20f..05e254e930e 100644 --- a/kernel/rcutree.c +++ b/kernel/rcutree.c @@ -95,7 +95,6 @@ static DEFINE_PER_CPU(struct task_struct *, rcu_cpu_kthread_task); DEFINE_PER_CPU(unsigned int, rcu_cpu_kthread_status); DEFINE_PER_CPU(int, rcu_cpu_kthread_cpu); DEFINE_PER_CPU(unsigned int, rcu_cpu_kthread_loops); -static DEFINE_PER_CPU(wait_queue_head_t, rcu_cpu_wq); DEFINE_PER_CPU(char, rcu_cpu_has_work); static char rcu_kthreads_spawnable; @@ -1476,7 +1475,7 @@ static void invoke_rcu_cpu_kthread(void) local_irq_restore(flags); return; } - wake_up(&__get_cpu_var(rcu_cpu_wq)); + wake_up_process(__this_cpu_read(rcu_cpu_kthread_task)); local_irq_restore(flags); } @@ -1596,14 +1595,12 @@ static int rcu_cpu_kthread(void *arg) unsigned long flags; int spincnt = 0; unsigned int *statusp = &per_cpu(rcu_cpu_kthread_status, cpu); - wait_queue_head_t *wqp = &per_cpu(rcu_cpu_wq, cpu); char work; char *workp = &per_cpu(rcu_cpu_has_work, cpu); for (;;) { *statusp = RCU_KTHREAD_WAITING; - wait_event_interruptible(*wqp, - *workp != 0 || kthread_should_stop()); + rcu_wait(*workp != 0 || kthread_should_stop()); local_bh_disable(); if (rcu_cpu_kthread_should_stop(cpu)) { local_bh_enable(); @@ -1654,7 +1651,6 @@ static int __cpuinit rcu_spawn_one_cpu_kthread(int cpu) per_cpu(rcu_cpu_kthread_cpu, cpu) = cpu; WARN_ON_ONCE(per_cpu(rcu_cpu_kthread_task, cpu) != NULL); per_cpu(rcu_cpu_kthread_task, cpu) = t; - wake_up_process(t); sp.sched_priority = RCU_KTHREAD_PRIO; sched_setscheduler_nocheck(t, SCHED_FIFO, &sp); return 0; @@ -1677,8 +1673,7 @@ static int rcu_node_kthread(void *arg) for (;;) { rnp->node_kthread_status = RCU_KTHREAD_WAITING; - wait_event_interruptible(rnp->node_wq, - atomic_read(&rnp->wakemask) != 0); + rcu_wait(atomic_read(&rnp->wakemask) != 0); rnp->node_kthread_status = RCU_KTHREAD_RUNNING; raw_spin_lock_irqsave(&rnp->lock, flags); mask = atomic_xchg(&rnp->wakemask, 0); @@ -1762,7 +1757,6 @@ static int __cpuinit rcu_spawn_one_node_kthread(struct rcu_state *rsp, raw_spin_lock_irqsave(&rnp->lock, flags); rnp->node_kthread_task = t; raw_spin_unlock_irqrestore(&rnp->lock, flags); - wake_up_process(t); sp.sched_priority = 99; sched_setscheduler_nocheck(t, SCHED_FIFO, &sp); } @@ -1779,21 +1773,16 @@ static int __init rcu_spawn_kthreads(void) rcu_kthreads_spawnable = 1; for_each_possible_cpu(cpu) { - init_waitqueue_head(&per_cpu(rcu_cpu_wq, cpu)); per_cpu(rcu_cpu_has_work, cpu) = 0; if (cpu_online(cpu)) (void)rcu_spawn_one_cpu_kthread(cpu); } rnp = rcu_get_root(rcu_state); - init_waitqueue_head(&rnp->node_wq); - rcu_init_boost_waitqueue(rnp); (void)rcu_spawn_one_node_kthread(rcu_state, rnp); - if (NUM_RCU_NODES > 1) - rcu_for_each_leaf_node(rcu_state, rnp) { - init_waitqueue_head(&rnp->node_wq); - rcu_init_boost_waitqueue(rnp); + if (NUM_RCU_NODES > 1) { + rcu_for_each_leaf_node(rcu_state, rnp) (void)rcu_spawn_one_node_kthread(rcu_state, rnp); - } + } return 0; } early_initcall(rcu_spawn_kthreads); diff --git a/kernel/rcutree.h b/kernel/rcutree.h index 561dcb9a8d2..7b9a08b4aae 100644 --- a/kernel/rcutree.h +++ b/kernel/rcutree.h @@ -159,9 +159,6 @@ struct rcu_node { struct task_struct *boost_kthread_task; /* kthread that takes care of priority */ /* boosting for this rcu_node structure. 
*/ - wait_queue_head_t boost_wq; - /* Wait queue on which to park the boost */ - /* kthread. */ unsigned int boost_kthread_status; /* State of boost_kthread_task for tracing. */ unsigned long n_tasks_boosted; @@ -188,9 +185,6 @@ struct rcu_node { /* kthread that takes care of this rcu_node */ /* structure, for example, awakening the */ /* per-CPU kthreads as needed. */ - wait_queue_head_t node_wq; - /* Wait queue on which to park the per-node */ - /* kthread. */ unsigned int node_kthread_status; /* State of node_kthread_task for tracing. */ } ____cacheline_internodealigned_in_smp; @@ -336,6 +330,16 @@ struct rcu_data { /* scheduling clock irq */ /* before ratting on them. */ +#define rcu_wait(cond) \ +do { \ + for (;;) { \ + set_current_state(TASK_INTERRUPTIBLE); \ + if (cond) \ + break; \ + schedule(); \ + } \ + __set_current_state(TASK_RUNNING); \ +} while (0) /* * RCU global state, including node hierarchy. This hierarchy is @@ -445,7 +449,6 @@ static void __cpuinit rcu_preempt_init_percpu_data(int cpu); static void rcu_preempt_send_cbs_to_online(void); static void __init __rcu_init_preempt(void); static void rcu_needs_cpu_flush(void); -static void __init rcu_init_boost_waitqueue(struct rcu_node *rnp); static void rcu_initiate_boost(struct rcu_node *rnp, unsigned long flags); static void rcu_boost_kthread_setaffinity(struct rcu_node *rnp, cpumask_var_t cm); diff --git a/kernel/rcutree_plugin.h b/kernel/rcutree_plugin.h index ed339702481..049f2787a98 100644 --- a/kernel/rcutree_plugin.h +++ b/kernel/rcutree_plugin.h @@ -1196,8 +1196,7 @@ static int rcu_boost_kthread(void *arg) for (;;) { rnp->boost_kthread_status = RCU_KTHREAD_WAITING; - wait_event_interruptible(rnp->boost_wq, rnp->boost_tasks || - rnp->exp_tasks); + rcu_wait(rnp->boost_tasks || rnp->exp_tasks); rnp->boost_kthread_status = RCU_KTHREAD_RUNNING; more2boost = rcu_boost(rnp); if (more2boost) @@ -1274,14 +1273,6 @@ static void rcu_preempt_boost_start_gp(struct rcu_node *rnp) rnp->boost_time = jiffies + RCU_BOOST_DELAY_JIFFIES; } -/* - * Initialize the RCU-boost waitqueue. - */ -static void __init rcu_init_boost_waitqueue(struct rcu_node *rnp) -{ - init_waitqueue_head(&rnp->boost_wq); -} - /* * Create an RCU-boost kthread for the specified node if one does not * already exist. We only create this kthread for preemptible RCU. @@ -1306,7 +1297,6 @@ static int __cpuinit rcu_spawn_one_boost_kthread(struct rcu_state *rsp, raw_spin_lock_irqsave(&rnp->lock, flags); rnp->boost_kthread_task = t; raw_spin_unlock_irqrestore(&rnp->lock, flags); - wake_up_process(t); sp.sched_priority = RCU_KTHREAD_PRIO; sched_setscheduler_nocheck(t, SCHED_FIFO, &sp); return 0; @@ -1328,10 +1318,6 @@ static void rcu_preempt_boost_start_gp(struct rcu_node *rnp) { } -static void __init rcu_init_boost_waitqueue(struct rcu_node *rnp) -{ -} - static int __cpuinit rcu_spawn_one_boost_kthread(struct rcu_state *rsp, struct rcu_node *rnp, int rnp_index) -- cgit v1.2.3-70-g09d2 From cc3ce5176d83cd8ae1134f86e208ea758d6cb78e Mon Sep 17 00:00:00 2001 From: "Paul E. McKenney" Date: Wed, 25 May 2011 13:42:06 -0700 Subject: rcu: Start RCU kthreads in TASK_INTERRUPTIBLE state Upon creation, kthreads are in TASK_UNINTERRUPTIBLE state, which can result in softlockup warnings. Because some of RCU's kthreads can legitimately be idle indefinitely, start them in TASK_INTERRUPTIBLE state in order to avoid those warnings. Suggested-by: Peter Zijlstra Signed-off-by: Paul E. McKenney Signed-off-by: Paul E. 
McKenney Tested-by: Yinghai Lu Signed-off-by: Ingo Molnar --- kernel/rcutree.c | 2 ++ kernel/rcutree_plugin.h | 1 + 2 files changed, 3 insertions(+) (limited to 'kernel') diff --git a/kernel/rcutree.c b/kernel/rcutree.c index 05e254e930e..77a7671dd14 100644 --- a/kernel/rcutree.c +++ b/kernel/rcutree.c @@ -1648,6 +1648,7 @@ static int __cpuinit rcu_spawn_one_cpu_kthread(int cpu) if (IS_ERR(t)) return PTR_ERR(t); kthread_bind(t, cpu); + set_task_state(t, TASK_INTERRUPTIBLE); per_cpu(rcu_cpu_kthread_cpu, cpu) = cpu; WARN_ON_ONCE(per_cpu(rcu_cpu_kthread_task, cpu) != NULL); per_cpu(rcu_cpu_kthread_task, cpu) = t; @@ -1755,6 +1756,7 @@ static int __cpuinit rcu_spawn_one_node_kthread(struct rcu_state *rsp, if (IS_ERR(t)) return PTR_ERR(t); raw_spin_lock_irqsave(&rnp->lock, flags); + set_task_state(t, TASK_INTERRUPTIBLE); rnp->node_kthread_task = t; raw_spin_unlock_irqrestore(&rnp->lock, flags); sp.sched_priority = 99; diff --git a/kernel/rcutree_plugin.h b/kernel/rcutree_plugin.h index 049f2787a98..a767b7dac36 100644 --- a/kernel/rcutree_plugin.h +++ b/kernel/rcutree_plugin.h @@ -1295,6 +1295,7 @@ static int __cpuinit rcu_spawn_one_boost_kthread(struct rcu_state *rsp, if (IS_ERR(t)) return PTR_ERR(t); raw_spin_lock_irqsave(&rnp->lock, flags); + set_task_state(t, TASK_INTERRUPTIBLE); rnp->boost_kthread_task = t; raw_spin_unlock_irqrestore(&rnp->lock, flags); sp.sched_priority = RCU_KTHREAD_PRIO; -- cgit v1.2.3-70-g09d2 From 333c5ae9948194428fe6c5ef5c088304fc98263b Mon Sep 17 00:00:00 2001 From: Tim Chen Date: Fri, 11 Feb 2011 12:49:04 -0800 Subject: idle governor: Avoid lock acquisition to read pm_qos before entering idle Thanks to Rafael, James, Mark and Andi for their reviews and comments. Here's version 2 of the patch, incorporating those comments and also some updates to my previous patch description. Before entering an idle state, the menu idle governor looks up the current pm_qos target value from the list of qos requests received. This lookup currently requires acquiring a lock to access the list of qos requests, which slows entry into the idle state when multiple cpus contend for the lock. The contention is severe when many cpus are waking and going idle. For example, for a simple workload with 32 pairs of processes ping-ponging messages to each other, with 64 cpu cores active in the test system, I see the following profile, with 37.82% of cpu cycles spent contending on pm_qos_lock: - 37.82% swapper [kernel.kallsyms] [k] _raw_spin_lock_irqsave - _raw_spin_lock_irqsave - 95.65% pm_qos_request menu_select cpuidle_idle_call - cpu_idle 99.98% start_secondary A better approach is to cache the updated pm_qos target value so that reading it does not require lock acquisition, as in the patch below. With this patch the contention on pm_qos_lock is removed, and I saw a 2.2X increase in throughput for my message passing workload.
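As an illustrative aside (not part of the patch): the fix keeps all list manipulation under pm_qos_lock but publishes the recomputed aggregate to a plain, aligned 32-bit field that readers load without the lock, relying on such loads being naturally atomic (hence the "Do not change to 64 bit" comment in the diff below). A hedged C11 sketch of the same publish/read split, with hypothetical names standing in for update_target() and pm_qos_request():

#include <pthread.h>
#include <stdatomic.h>
#include <stdint.h>
#include <stdio.h>

static pthread_mutex_t qos_lock = PTHREAD_MUTEX_INITIALIZER;
static _Atomic int32_t qos_target = INT32_MAX; /* default for a PM_QOS_MIN class */

/* Slow path: request-list changes and aggregation stay under the lock. */
static void qos_update(int32_t new_aggregate)
{
        pthread_mutex_lock(&qos_lock);
        /* ... add/update/remove a request, recompute the aggregate ... */
        atomic_store_explicit(&qos_target, new_aggregate, memory_order_relaxed);
        pthread_mutex_unlock(&qos_lock);
}

/* Fast path: idle entry reads the cached value with no lock at all. */
static int32_t qos_read(void)
{
        return atomic_load_explicit(&qos_target, memory_order_relaxed);
}

int main(void)
{
        qos_update(100);
        printf("target = %d\n", (int)qos_read()); /* prints 100 */
        return 0;
}

A reader racing with an update may briefly observe the previous value; the patch treats that as acceptable for idle-state selection, since the very next read sees the new target.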
cc: stable@kernel.org Signed-off-by: Tim Chen Acked-by: Andi Kleen Acked-by: James Bottomley Acked-by: mark gross Signed-off-by: Len Brown --- include/linux/pm_qos_params.h | 4 ++++ kernel/pm_qos_params.c | 37 +++++++++++++++++++++++++------------ 2 files changed, 29 insertions(+), 12 deletions(-) (limited to 'kernel') diff --git a/include/linux/pm_qos_params.h b/include/linux/pm_qos_params.h index 77cbddb3784..a7d87f911ca 100644 --- a/include/linux/pm_qos_params.h +++ b/include/linux/pm_qos_params.h @@ -16,6 +16,10 @@ #define PM_QOS_NUM_CLASSES 4 #define PM_QOS_DEFAULT_VALUE -1 +#define PM_QOS_CPU_DMA_LAT_DEFAULT_VALUE (2000 * USEC_PER_SEC) +#define PM_QOS_NETWORK_LAT_DEFAULT_VALUE (2000 * USEC_PER_SEC) +#define PM_QOS_NETWORK_THROUGHPUT_DEFAULT_VALUE 0 + struct pm_qos_request_list { struct plist_node list; int pm_qos_class; diff --git a/kernel/pm_qos_params.c b/kernel/pm_qos_params.c index aeaa7f84682..6a8fad82a3a 100644 --- a/kernel/pm_qos_params.c +++ b/kernel/pm_qos_params.c @@ -53,11 +53,17 @@ enum pm_qos_type { PM_QOS_MIN /* return the smallest value */ }; +/* + * Note: The lockless read path depends on the CPU accessing + * target_value atomically. Atomic access is only guaranteed on all CPU + * types linux supports for 32 bit quantites + */ struct pm_qos_object { struct plist_head requests; struct blocking_notifier_head *notifiers; struct miscdevice pm_qos_power_miscdev; char *name; + s32 target_value; /* Do not change to 64 bit */ s32 default_value; enum pm_qos_type type; }; @@ -70,7 +76,8 @@ static struct pm_qos_object cpu_dma_pm_qos = { .requests = PLIST_HEAD_INIT(cpu_dma_pm_qos.requests, pm_qos_lock), .notifiers = &cpu_dma_lat_notifier, .name = "cpu_dma_latency", - .default_value = 2000 * USEC_PER_SEC, + .target_value = PM_QOS_CPU_DMA_LAT_DEFAULT_VALUE, + .default_value = PM_QOS_CPU_DMA_LAT_DEFAULT_VALUE, .type = PM_QOS_MIN, }; @@ -79,7 +86,8 @@ static struct pm_qos_object network_lat_pm_qos = { .requests = PLIST_HEAD_INIT(network_lat_pm_qos.requests, pm_qos_lock), .notifiers = &network_lat_notifier, .name = "network_latency", - .default_value = 2000 * USEC_PER_SEC, + .target_value = PM_QOS_NETWORK_LAT_DEFAULT_VALUE, + .default_value = PM_QOS_NETWORK_LAT_DEFAULT_VALUE, .type = PM_QOS_MIN }; @@ -89,7 +97,8 @@ static struct pm_qos_object network_throughput_pm_qos = { .requests = PLIST_HEAD_INIT(network_throughput_pm_qos.requests, pm_qos_lock), .notifiers = &network_throughput_notifier, .name = "network_throughput", - .default_value = 0, + .target_value = PM_QOS_NETWORK_THROUGHPUT_DEFAULT_VALUE, + .default_value = PM_QOS_NETWORK_THROUGHPUT_DEFAULT_VALUE, .type = PM_QOS_MAX, }; @@ -132,6 +141,16 @@ static inline int pm_qos_get_value(struct pm_qos_object *o) } } +static inline s32 pm_qos_read_value(struct pm_qos_object *o) +{ + return o->target_value; +} + +static inline void pm_qos_set_value(struct pm_qos_object *o, s32 value) +{ + o->target_value = value; +} + static void update_target(struct pm_qos_object *o, struct plist_node *node, int del, int value) { @@ -156,6 +175,7 @@ static void update_target(struct pm_qos_object *o, struct plist_node *node, plist_add(node, &o->requests); } curr_value = pm_qos_get_value(o); + pm_qos_set_value(o, curr_value); spin_unlock_irqrestore(&pm_qos_lock, flags); if (prev_value != curr_value) @@ -190,18 +210,11 @@ static int find_pm_qos_object_by_minor(int minor) * pm_qos_request - returns current system wide qos expectation * @pm_qos_class: identification of which qos value is requested * - * This function returns the current target value in an 
atomic manner. + * This function returns the current target value. */ int pm_qos_request(int pm_qos_class) { - unsigned long flags; - int value; - - spin_lock_irqsave(&pm_qos_lock, flags); - value = pm_qos_get_value(pm_qos_array[pm_qos_class]); - spin_unlock_irqrestore(&pm_qos_lock, flags); - - return value; + return pm_qos_read_value(pm_qos_array[pm_qos_class]); } EXPORT_SYMBOL_GPL(pm_qos_request); -- cgit v1.2.3-70-g09d2 From 6345d24daf0c1fffe6642081d783cdf653ebaa5c Mon Sep 17 00:00:00 2001 From: Linus Torvalds Date: Sun, 29 May 2011 11:32:28 -0700 Subject: mm: Fix boot crash in mm_alloc() Thomas Gleixner reports that we now have a boot crash triggered by CONFIG_CPUMASK_OFFSTACK=y: BUG: unable to handle kernel NULL pointer dereference at (null) IP: [] find_next_bit+0x55/0xb0 Call Trace: [] cpumask_any_but+0x2a/0x70 [] flush_tlb_mm+0x2b/0x80 [] pud_populate+0x35/0x50 [] pgd_alloc+0x9a/0xf0 [] mm_init+0xec/0x120 [] mm_alloc+0x53/0xd0 which was introduced by commit de03c72cfce5 ("mm: convert mm->cpu_vm_cpumask into cpumask_var_t"), and is due to wrong ordering of mm_init() vs mm_init_cpumask Thomas wrote a patch to just fix the ordering of initialization, but I hate the new double allocation in the fork path, so I ended up instead doing some more radical surgery to clean it all up. Reported-by: Thomas Gleixner Reported-by: Ingo Molnar Cc: KOSAKI Motohiro Cc: Andrew Morton Signed-off-by: Linus Torvalds --- include/linux/mm_types.h | 14 ++++++++++++-- include/linux/sched.h | 1 - init/main.c | 2 +- kernel/fork.c | 42 ++++++++++-------------------------------- 4 files changed, 23 insertions(+), 36 deletions(-) (limited to 'kernel') diff --git a/include/linux/mm_types.h b/include/linux/mm_types.h index 2a78aae78c6..027935c86c6 100644 --- a/include/linux/mm_types.h +++ b/include/linux/mm_types.h @@ -264,6 +264,8 @@ struct mm_struct { struct linux_binfmt *binfmt; + cpumask_var_t cpu_vm_mask_var; + /* Architecture-specific MM context */ mm_context_t context; @@ -311,10 +313,18 @@ struct mm_struct { #ifdef CONFIG_TRANSPARENT_HUGEPAGE pgtable_t pmd_huge_pte; /* protected by page_table_lock */ #endif - - cpumask_var_t cpu_vm_mask_var; +#ifdef CONFIG_CPUMASK_OFFSTACK + struct cpumask cpumask_allocation; +#endif }; +static inline void mm_init_cpumask(struct mm_struct *mm) +{ +#ifdef CONFIG_CPUMASK_OFFSTACK + mm->cpu_vm_mask_var = &mm->cpumask_allocation; +#endif +} + /* Future-safe accessor for struct mm_struct's cpu_vm_mask. 
*/ static inline cpumask_t *mm_cpumask(struct mm_struct *mm) { diff --git a/include/linux/sched.h b/include/linux/sched.h index bcddd013810..2a8621c4be1 100644 --- a/include/linux/sched.h +++ b/include/linux/sched.h @@ -2194,7 +2194,6 @@ static inline void mmdrop(struct mm_struct * mm) if (unlikely(atomic_dec_and_test(&mm->mm_count))) __mmdrop(mm); } -extern int mm_init_cpumask(struct mm_struct *mm, struct mm_struct *oldmm); /* mmput gets rid of the mappings and all user-space */ extern void mmput(struct mm_struct *); diff --git a/init/main.c b/init/main.c index d2f1e086bf3..cafba67c13b 100644 --- a/init/main.c +++ b/init/main.c @@ -487,6 +487,7 @@ asmlinkage void __init start_kernel(void) printk(KERN_NOTICE "%s", linux_banner); setup_arch(&command_line); mm_init_owner(&init_mm, &init_task); + mm_init_cpumask(&init_mm); setup_command_line(command_line); setup_nr_cpu_ids(); setup_per_cpu_areas(); @@ -510,7 +511,6 @@ asmlinkage void __init start_kernel(void) sort_main_extable(); trap_init(); mm_init(); - BUG_ON(mm_init_cpumask(&init_mm, 0)); /* * Set up the scheduler prior starting any interrupts (such as the diff --git a/kernel/fork.c b/kernel/fork.c index ca406d91671..0276c30401a 100644 --- a/kernel/fork.c +++ b/kernel/fork.c @@ -484,20 +484,6 @@ static void mm_init_aio(struct mm_struct *mm) #endif } -int mm_init_cpumask(struct mm_struct *mm, struct mm_struct *oldmm) -{ -#ifdef CONFIG_CPUMASK_OFFSTACK - if (!alloc_cpumask_var(&mm->cpu_vm_mask_var, GFP_KERNEL)) - return -ENOMEM; - - if (oldmm) - cpumask_copy(mm_cpumask(mm), mm_cpumask(oldmm)); - else - memset(mm_cpumask(mm), 0, cpumask_size()); -#endif - return 0; -} - static struct mm_struct * mm_init(struct mm_struct * mm, struct task_struct *p) { atomic_set(&mm->mm_users, 1); @@ -538,17 +524,8 @@ struct mm_struct * mm_alloc(void) return NULL; memset(mm, 0, sizeof(*mm)); - mm = mm_init(mm, current); - if (!mm) - return NULL; - - if (mm_init_cpumask(mm, NULL)) { - mm_free_pgd(mm); - free_mm(mm); - return NULL; - } - - return mm; + mm_init_cpumask(mm); + return mm_init(mm, current); } /* @@ -559,7 +536,6 @@ struct mm_struct * mm_alloc(void) void __mmdrop(struct mm_struct *mm) { BUG_ON(mm == &init_mm); - free_cpumask_var(mm->cpu_vm_mask_var); mm_free_pgd(mm); destroy_context(mm); mmu_notifier_mm_destroy(mm); @@ -753,6 +729,7 @@ struct mm_struct *dup_mm(struct task_struct *tsk) goto fail_nomem; memcpy(mm, oldmm, sizeof(*mm)); + mm_init_cpumask(mm); /* Initializing for Swap token stuff */ mm->token_priority = 0; @@ -765,9 +742,6 @@ struct mm_struct *dup_mm(struct task_struct *tsk) if (!mm_init(mm, tsk)) goto fail_nomem; - if (mm_init_cpumask(mm, oldmm)) - goto fail_nocpumask; - if (init_new_context(tsk, mm)) goto fail_nocontext; @@ -794,9 +768,6 @@ fail_nomem: return NULL; fail_nocontext: - free_cpumask_var(mm->cpu_vm_mask_var); - -fail_nocpumask: /* * If init_new_context() failed, we cannot use mmput() to free the mm * because it calls destroy_context() @@ -1591,6 +1562,13 @@ void __init proc_caches_init(void) fs_cachep = kmem_cache_create("fs_cache", sizeof(struct fs_struct), 0, SLAB_HWCACHE_ALIGN|SLAB_PANIC|SLAB_NOTRACK, NULL); + /* + * FIXME! The "sizeof(struct mm_struct)" currently includes the + * whole struct cpumask for the OFFSTACK case. We could change + * this to *only* allocate as much of it as required by the + * maximum number of CPU's we can ever have. The cpumask_allocation + * is at the end of the structure, exactly for that reason. 
+ */ mm_cachep = kmem_cache_create("mm_struct", sizeof(struct mm_struct), ARCH_MIN_MMSTRUCT_ALIGN, SLAB_HWCACHE_ALIGN|SLAB_PANIC|SLAB_NOTRACK, NULL); -- cgit v1.2.3-70-g09d2
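As an illustrative aside (not part of the patch): the core trick in the fix above is to keep the cpu_vm_mask_var pointer that CONFIG_CPUMASK_OFFSTACK code dereferences, but to back it with storage embedded at the tail of mm_struct, so that initialization cannot fail and needs no separate allocation (which is also why the FIXME stresses that cpumask_allocation sits at the end of the structure). A hedged, self-contained C sketch of the layout, with all names hypothetical:

#include <stdio.h>
#include <string.h>

#define MASK_WORDS 4 /* stand-in for the real cpumask size */

struct mm_like {
        int mm_users;
        unsigned long *cpu_vm_mask_var;               /* what mm_cpumask() would return */
        unsigned long cpumask_allocation[MASK_WORDS]; /* deliberately kept last */
};

/* Point the pointer at the embedded storage; this cannot fail. */
static void mm_like_init_cpumask(struct mm_like *mm)
{
        mm->cpu_vm_mask_var = mm->cpumask_allocation;
        memset(mm->cpu_vm_mask_var, 0, sizeof(mm->cpumask_allocation));
}

int main(void)
{
        struct mm_like mm;

        mm_like_init_cpumask(&mm);
        mm.cpu_vm_mask_var[0] |= 1UL << 2; /* record CPU 2 as a user of this mm */
        printf("word0 = %#lx\n", mm.cpu_vm_mask_var[0]); /* prints 0x4 */
        return 0;
}

Because the backing store lives inside the structure, dup_mm()'s memcpy() of the whole mm_struct would leave the pointer aimed at the old object's storage, which is exactly why the patch calls mm_init_cpumask() again right after the memcpy().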