From f36f8c75ae2e7d4da34f4c908cebdb4aa42c977e Mon Sep 17 00:00:00 2001 From: David Howells Date: Tue, 24 Sep 2013 10:35:19 +0100 Subject: KEYS: Add per-user_namespace registers for persistent per-UID kerberos caches Add support for per-user_namespace registers of persistent per-UID kerberos caches held within the kernel. This allows the kerberos cache to be retained beyond the life of all a user's processes so that the user's cron jobs can work. The kerberos cache is envisioned as a keyring/key tree looking something like: struct user_namespace \___ .krb_cache keyring - The register \___ _krb.0 keyring - Root's Kerberos cache \___ _krb.5000 keyring - User 5000's Kerberos cache \___ _krb.5001 keyring - User 5001's Kerberos cache \___ tkt785 big_key - A ccache blob \___ tkt12345 big_key - Another ccache blob Or possibly: struct user_namespace \___ .krb_cache keyring - The register \___ _krb.0 keyring - Root's Kerberos cache \___ _krb.5000 keyring - User 5000's Kerberos cache \___ _krb.5001 keyring - User 5001's Kerberos cache \___ tkt785 keyring - A ccache \___ krbtgt/REDHAT.COM@REDHAT.COM big_key \___ http/REDHAT.COM@REDHAT.COM user \___ afs/REDHAT.COM@REDHAT.COM user \___ nfs/REDHAT.COM@REDHAT.COM user \___ krbtgt/KERNEL.ORG@KERNEL.ORG big_key \___ http/KERNEL.ORG@KERNEL.ORG big_key What goes into a particular Kerberos cache is entirely up to userspace. Kernel support is limited to giving you the Kerberos cache keyring that you want. The user asks for their Kerberos cache by: krb_cache = keyctl_get_krbcache(uid, dest_keyring); The uid is -1 or the user's own UID for the user's own cache or the uid of some other user's cache (requires CAP_SETUID). This permits rpc.gssd or whatever to mess with the cache. The cache returned is a keyring named "_krb." that the possessor can read, search, clear, invalidate, unlink from and add links to. Active LSMs get a chance to rule on whether the caller is permitted to make a link. Each uid's cache keyring is created when it first accessed and is given a timeout that is extended each time this function is called so that the keyring goes away after a while. The timeout is configurable by sysctl but defaults to three days. Each user_namespace struct gets a lazily-created keyring that serves as the register. The cache keyrings are added to it. This means that standard key search and garbage collection facilities are available. The user_namespace struct's register goes away when it does and anything left in it is then automatically gc'd. Signed-off-by: David Howells Tested-by: Simo Sorce cc: Serge E. Hallyn cc: Eric W. Biederman --- kernel/user.c | 4 ++++ kernel/user_namespace.c | 6 ++++++ 2 files changed, 10 insertions(+) (limited to 'kernel') diff --git a/kernel/user.c b/kernel/user.c index 5bbb91988e6..a3a0dbfda32 100644 --- a/kernel/user.c +++ b/kernel/user.c @@ -51,6 +51,10 @@ struct user_namespace init_user_ns = { .owner = GLOBAL_ROOT_UID, .group = GLOBAL_ROOT_GID, .proc_inum = PROC_USER_INIT_INO, +#ifdef CONFIG_KEYS_KERBEROS_CACHE + .krb_cache_register_sem = + __RWSEM_INITIALIZER(init_user_ns.krb_cache_register_sem), +#endif }; EXPORT_SYMBOL_GPL(init_user_ns); diff --git a/kernel/user_namespace.c b/kernel/user_namespace.c index 13fb1134ba5..240fb62cf39 100644 --- a/kernel/user_namespace.c +++ b/kernel/user_namespace.c @@ -101,6 +101,9 @@ int create_user_ns(struct cred *new) set_cred_user_ns(new, ns); +#ifdef CONFIG_PERSISTENT_KEYRINGS + init_rwsem(&ns->persistent_keyring_register_sem); +#endif return 0; } @@ -130,6 +133,9 @@ void free_user_ns(struct user_namespace *ns) do { parent = ns->parent; +#ifdef CONFIG_PERSISTENT_KEYRINGS + key_put(ns->persistent_keyring_register); +#endif proc_free_inum(ns->proc_inum); kmem_cache_free(user_ns_cachep, ns); ns = parent; -- cgit v1.2.3-70-g09d2 From 9abc4e66eb839c28516916543768be08c814a3c8 Mon Sep 17 00:00:00 2001 From: David Howells Date: Fri, 30 Aug 2013 16:15:10 +0100 Subject: KEYS: Rename public key parameter name arrays Rename the arrays of public key parameters (public key algorithm names, hash algorithm names and ID type names) so that the array name ends in "_name". Signed-off-by: David Howells Reviewed-by: Kees Cook Reviewed-by: Josh Boyer --- crypto/asymmetric_keys/public_key.c | 14 +++++++------- crypto/asymmetric_keys/x509_public_key.c | 8 ++++---- include/crypto/public_key.h | 6 +++--- kernel/module_signing.c | 4 ++-- 4 files changed, 16 insertions(+), 16 deletions(-) (limited to 'kernel') diff --git a/crypto/asymmetric_keys/public_key.c b/crypto/asymmetric_keys/public_key.c index cb2e29180a8..b313df1bd25 100644 --- a/crypto/asymmetric_keys/public_key.c +++ b/crypto/asymmetric_keys/public_key.c @@ -22,13 +22,13 @@ MODULE_LICENSE("GPL"); -const char *const pkey_algo[PKEY_ALGO__LAST] = { +const char *const pkey_algo_name[PKEY_ALGO__LAST] = { [PKEY_ALGO_DSA] = "DSA", [PKEY_ALGO_RSA] = "RSA", }; -EXPORT_SYMBOL_GPL(pkey_algo); +EXPORT_SYMBOL_GPL(pkey_algo_name); -const char *const pkey_hash_algo[PKEY_HASH__LAST] = { +const char *const pkey_hash_algo_name[PKEY_HASH__LAST] = { [PKEY_HASH_MD4] = "md4", [PKEY_HASH_MD5] = "md5", [PKEY_HASH_SHA1] = "sha1", @@ -38,13 +38,13 @@ const char *const pkey_hash_algo[PKEY_HASH__LAST] = { [PKEY_HASH_SHA512] = "sha512", [PKEY_HASH_SHA224] = "sha224", }; -EXPORT_SYMBOL_GPL(pkey_hash_algo); +EXPORT_SYMBOL_GPL(pkey_hash_algo_name); -const char *const pkey_id_type[PKEY_ID_TYPE__LAST] = { +const char *const pkey_id_type_name[PKEY_ID_TYPE__LAST] = { [PKEY_ID_PGP] = "PGP", [PKEY_ID_X509] = "X509", }; -EXPORT_SYMBOL_GPL(pkey_id_type); +EXPORT_SYMBOL_GPL(pkey_id_type_name); /* * Provide a part of a description of the key for /proc/keys. @@ -56,7 +56,7 @@ static void public_key_describe(const struct key *asymmetric_key, if (key) seq_printf(m, "%s.%s", - pkey_id_type[key->id_type], key->algo->name); + pkey_id_type_name[key->id_type], key->algo->name); } /* diff --git a/crypto/asymmetric_keys/x509_public_key.c b/crypto/asymmetric_keys/x509_public_key.c index 06007f0e880..afbbc362f85 100644 --- a/crypto/asymmetric_keys/x509_public_key.c +++ b/crypto/asymmetric_keys/x509_public_key.c @@ -49,7 +49,7 @@ static int x509_check_signature(const struct public_key *pub, /* Allocate the hashing algorithm we're going to need and find out how * big the hash operational data will be. */ - tfm = crypto_alloc_shash(pkey_hash_algo[cert->sig_hash_algo], 0, 0); + tfm = crypto_alloc_shash(pkey_hash_algo_name[cert->sig_hash_algo], 0, 0); if (IS_ERR(tfm)) return (PTR_ERR(tfm) == -ENOENT) ? -ENOPKG : PTR_ERR(tfm); @@ -117,7 +117,7 @@ static int x509_key_preparse(struct key_preparsed_payload *prep) pr_devel("Cert Issuer: %s\n", cert->issuer); pr_devel("Cert Subject: %s\n", cert->subject); - pr_devel("Cert Key Algo: %s\n", pkey_algo[cert->pkey_algo]); + pr_devel("Cert Key Algo: %s\n", pkey_algo_name[cert->pkey_algo]); pr_devel("Cert Valid From: %04ld-%02d-%02d %02d:%02d:%02d\n", cert->valid_from.tm_year + 1900, cert->valid_from.tm_mon + 1, cert->valid_from.tm_mday, cert->valid_from.tm_hour, @@ -127,8 +127,8 @@ static int x509_key_preparse(struct key_preparsed_payload *prep) cert->valid_to.tm_mday, cert->valid_to.tm_hour, cert->valid_to.tm_min, cert->valid_to.tm_sec); pr_devel("Cert Signature: %s + %s\n", - pkey_algo[cert->sig_pkey_algo], - pkey_hash_algo[cert->sig_hash_algo]); + pkey_algo_name[cert->sig_pkey_algo], + pkey_hash_algo_name[cert->sig_hash_algo]); if (!cert->fingerprint || !cert->authority) { pr_warn("Cert for '%s' must have SubjKeyId and AuthKeyId extensions\n", diff --git a/include/crypto/public_key.h b/include/crypto/public_key.h index f5b0224c996..619d5706d83 100644 --- a/include/crypto/public_key.h +++ b/include/crypto/public_key.h @@ -22,7 +22,7 @@ enum pkey_algo { PKEY_ALGO__LAST }; -extern const char *const pkey_algo[PKEY_ALGO__LAST]; +extern const char *const pkey_algo_name[PKEY_ALGO__LAST]; enum pkey_hash_algo { PKEY_HASH_MD4, @@ -36,7 +36,7 @@ enum pkey_hash_algo { PKEY_HASH__LAST }; -extern const char *const pkey_hash_algo[PKEY_HASH__LAST]; +extern const char *const pkey_hash_algo_name[PKEY_HASH__LAST]; enum pkey_id_type { PKEY_ID_PGP, /* OpenPGP generated key ID */ @@ -44,7 +44,7 @@ enum pkey_id_type { PKEY_ID_TYPE__LAST }; -extern const char *const pkey_id_type[PKEY_ID_TYPE__LAST]; +extern const char *const pkey_id_type_name[PKEY_ID_TYPE__LAST]; /* * Cryptographic data for the public-key subtype of the asymmetric key type. diff --git a/kernel/module_signing.c b/kernel/module_signing.c index f2970bddc5e..ee476404167 100644 --- a/kernel/module_signing.c +++ b/kernel/module_signing.c @@ -54,7 +54,7 @@ static struct public_key_signature *mod_make_digest(enum pkey_hash_algo hash, /* Allocate the hashing algorithm we're going to need and find out how * big the hash operational data will be. */ - tfm = crypto_alloc_shash(pkey_hash_algo[hash], 0, 0); + tfm = crypto_alloc_shash(pkey_hash_algo_name[hash], 0, 0); if (IS_ERR(tfm)) return (PTR_ERR(tfm) == -ENOENT) ? ERR_PTR(-ENOPKG) : ERR_CAST(tfm); @@ -217,7 +217,7 @@ int mod_verify_sig(const void *mod, unsigned long *_modlen) return -ENOPKG; if (ms.hash >= PKEY_HASH__LAST || - !pkey_hash_algo[ms.hash]) + !pkey_hash_algo_name[ms.hash]) return -ENOPKG; key = request_asymmetric_key(sig, ms.signer_len, -- cgit v1.2.3-70-g09d2 From f0e6d220a7cd93afa0260ac5e7849f00b05e035a Mon Sep 17 00:00:00 2001 From: David Howells Date: Fri, 30 Aug 2013 16:07:13 +0100 Subject: KEYS: Load *.x509 files into kernel keyring Load all the files matching the pattern "*.x509" that are to be found in kernel base source dir and base build dir into the module signing keyring. The "extra_certificates" file is then redundant. Signed-off-by: David Howells --- kernel/Makefile | 35 +++++++++++++++++++++++++++++------ kernel/modsign_certificate.S | 3 +-- 2 files changed, 30 insertions(+), 8 deletions(-) (limited to 'kernel') diff --git a/kernel/Makefile b/kernel/Makefile index 1ce47553fb0..c34e5f993a2 100644 --- a/kernel/Makefile +++ b/kernel/Makefile @@ -142,17 +142,40 @@ $(obj)/timeconst.h: $(obj)/hz.bc $(src)/timeconst.bc FORCE $(call if_changed,bc) ifeq ($(CONFIG_MODULE_SIG),y) +############################################################################### # -# Pull the signing certificate and any extra certificates into the kernel +# Roll all the X.509 certificates that we can find together and pull +# them into the kernel. # +############################################################################### +X509_CERTIFICATES-y := $(wildcard *.x509) $(wildcard $(srctree)/*.x509) +X509_CERTIFICATES-$(CONFIG_MODULE_SIG) += signing_key.x509 +X509_CERTIFICATES := $(sort $(X509_CERTIFICATES-y)) + +ifeq ($(X509_CERTIFICATES),) +$(warning *** No X.509 certificates found ***) +endif + +ifneq ($(wildcard $(obj)/.x509.list),) +ifneq ($(shell cat $(obj)/.x509.list),$(X509_CERTIFICATES)) +$(info X.509 certificate list changed) +$(shell rm $(obj)/.x509.list) +endif +endif + +kernel/modsign_certificate.o: $(obj)/x509_certificate_list -quiet_cmd_touch = TOUCH $@ - cmd_touch = touch $@ +quiet_cmd_x509certs = CERTS $@ + cmd_x509certs = cat $(X509_CERTIFICATES) /dev/null >$@ +targets += $(obj)/x509_certificate_list +$(obj)/x509_certificate_list: $(X509_CERTIFICATES) $(obj)/.x509.list + $(call if_changed,x509certs) -extra_certificates: - $(call cmd,touch) +targets += $(obj)/.x509.list +$(obj)/.x509.list: + @echo $(X509_CERTIFICATES) >$@ -kernel/modsign_certificate.o: signing_key.x509 extra_certificates +clean-files := x509_certificate_list .x509.list ############################################################################### # diff --git a/kernel/modsign_certificate.S b/kernel/modsign_certificate.S index 4a9a86d12c8..6fe03c7ffe7 100644 --- a/kernel/modsign_certificate.S +++ b/kernel/modsign_certificate.S @@ -7,6 +7,5 @@ .section ".init.data","aw" GLOBAL(modsign_certificate_list) - .incbin "signing_key.x509" - .incbin "extra_certificates" + .incbin "kernel/x509_certificate_list" GLOBAL(modsign_certificate_list_end) -- cgit v1.2.3-70-g09d2 From 0fbd39cf7ffe3b6a787b66b672d21b84e4675352 Mon Sep 17 00:00:00 2001 From: David Howells Date: Fri, 30 Aug 2013 17:13:15 +0100 Subject: KEYS: Have make canonicalise the paths of the X.509 certs better to deduplicate Have make canonicalise the paths of the X.509 certificates before we sort them as this allows $(sort) to better remove duplicates. Signed-off-by: David Howells --- kernel/Makefile | 12 +++++++++--- 1 file changed, 9 insertions(+), 3 deletions(-) (limited to 'kernel') diff --git a/kernel/Makefile b/kernel/Makefile index c34e5f993a2..2c24195249d 100644 --- a/kernel/Makefile +++ b/kernel/Makefile @@ -144,13 +144,19 @@ $(obj)/timeconst.h: $(obj)/hz.bc $(src)/timeconst.bc FORCE ifeq ($(CONFIG_MODULE_SIG),y) ############################################################################### # -# Roll all the X.509 certificates that we can find together and pull -# them into the kernel. +# Roll all the X.509 certificates that we can find together and pull them into +# the kernel. +# +# We look in the source root and the build root for all files whose name ends +# in ".x509". Unfortunately, this will generate duplicate filenames, so we +# have make canonicalise the pathnames and then sort them to discard the +# duplicates. # ############################################################################### X509_CERTIFICATES-y := $(wildcard *.x509) $(wildcard $(srctree)/*.x509) X509_CERTIFICATES-$(CONFIG_MODULE_SIG) += signing_key.x509 -X509_CERTIFICATES := $(sort $(X509_CERTIFICATES-y)) +X509_CERTIFICATES := $(sort $(foreach CERT,$(X509_CERTIFICATES-y), \ + $(or $(realpath $(CERT)),$(CERT)))) ifeq ($(X509_CERTIFICATES),) $(warning *** No X.509 certificates found ***) -- cgit v1.2.3-70-g09d2 From b56e5a17b6b9acd16997960504b9940d0d7984e7 Mon Sep 17 00:00:00 2001 From: David Howells Date: Fri, 30 Aug 2013 16:07:30 +0100 Subject: KEYS: Separate the kernel signature checking keyring from module signing Separate the kernel signature checking keyring from module signing so that it can be used by code other than the module-signing code. Signed-off-by: David Howells --- include/keys/system_keyring.h | 23 ++++++++++ init/Kconfig | 13 ++++++ kernel/Makefile | 15 ++++-- kernel/modsign_certificate.S | 11 ----- kernel/modsign_pubkey.c | 104 ------------------------------------------ kernel/module-internal.h | 2 - kernel/module_signing.c | 3 +- kernel/system_certificates.S | 12 +++++ kernel/system_keyring.c | 103 +++++++++++++++++++++++++++++++++++++++++ 9 files changed, 163 insertions(+), 123 deletions(-) create mode 100644 include/keys/system_keyring.h delete mode 100644 kernel/modsign_certificate.S delete mode 100644 kernel/modsign_pubkey.c create mode 100644 kernel/system_certificates.S create mode 100644 kernel/system_keyring.c (limited to 'kernel') diff --git a/include/keys/system_keyring.h b/include/keys/system_keyring.h new file mode 100644 index 00000000000..8dabc399bd1 --- /dev/null +++ b/include/keys/system_keyring.h @@ -0,0 +1,23 @@ +/* System keyring containing trusted public keys. + * + * Copyright (C) 2013 Red Hat, Inc. All Rights Reserved. + * Written by David Howells (dhowells@redhat.com) + * + * This program is free software; you can redistribute it and/or + * modify it under the terms of the GNU General Public Licence + * as published by the Free Software Foundation; either version + * 2 of the Licence, or (at your option) any later version. + */ + +#ifndef _KEYS_SYSTEM_KEYRING_H +#define _KEYS_SYSTEM_KEYRING_H + +#ifdef CONFIG_SYSTEM_TRUSTED_KEYRING + +#include + +extern struct key *system_trusted_keyring; + +#endif + +#endif /* _KEYS_SYSTEM_KEYRING_H */ diff --git a/init/Kconfig b/init/Kconfig index 3ecd8a1178f..0ff5407a837 100644 --- a/init/Kconfig +++ b/init/Kconfig @@ -1668,6 +1668,18 @@ config BASE_SMALL default 0 if BASE_FULL default 1 if !BASE_FULL +config SYSTEM_TRUSTED_KEYRING + bool "Provide system-wide ring of trusted keys" + depends on KEYS + help + Provide a system keyring to which trusted keys can be added. Keys in + the keyring are considered to be trusted. Keys may be added at will + by the kernel from compiled-in data and from hardware key stores, but + userspace may only add extra keys if those keys can be verified by + keys already in the keyring. + + Keys in this keyring are used by module signature checking. + menuconfig MODULES bool "Enable loadable module support" option modules @@ -1741,6 +1753,7 @@ config MODULE_SRCVERSION_ALL config MODULE_SIG bool "Module signature verification" depends on MODULES + select SYSTEM_TRUSTED_KEYRING select KEYS select CRYPTO select ASYMMETRIC_KEY_TYPE diff --git a/kernel/Makefile b/kernel/Makefile index 2c24195249d..63136989c13 100644 --- a/kernel/Makefile +++ b/kernel/Makefile @@ -54,8 +54,9 @@ obj-$(CONFIG_SMP) += spinlock.o obj-$(CONFIG_DEBUG_SPINLOCK) += spinlock.o obj-$(CONFIG_PROVE_LOCKING) += spinlock.o obj-$(CONFIG_UID16) += uid16.o +obj-$(CONFIG_SYSTEM_TRUSTED_KEYRING) += system_keyring.o system_certificates.o obj-$(CONFIG_MODULES) += module.o -obj-$(CONFIG_MODULE_SIG) += module_signing.o modsign_pubkey.o modsign_certificate.o +obj-$(CONFIG_MODULE_SIG) += module_signing.o obj-$(CONFIG_KALLSYMS) += kallsyms.o obj-$(CONFIG_BSD_PROCESS_ACCT) += acct.o obj-$(CONFIG_KEXEC) += kexec.o @@ -141,11 +142,11 @@ targets += timeconst.h $(obj)/timeconst.h: $(obj)/hz.bc $(src)/timeconst.bc FORCE $(call if_changed,bc) -ifeq ($(CONFIG_MODULE_SIG),y) ############################################################################### # # Roll all the X.509 certificates that we can find together and pull them into -# the kernel. +# the kernel so that they get loaded into the system trusted keyring during +# boot. # # We look in the source root and the build root for all files whose name ends # in ".x509". Unfortunately, this will generate duplicate filenames, so we @@ -153,6 +154,7 @@ ifeq ($(CONFIG_MODULE_SIG),y) # duplicates. # ############################################################################### +ifeq ($(CONFIG_SYSTEM_TRUSTED_KEYRING),y) X509_CERTIFICATES-y := $(wildcard *.x509) $(wildcard $(srctree)/*.x509) X509_CERTIFICATES-$(CONFIG_MODULE_SIG) += signing_key.x509 X509_CERTIFICATES := $(sort $(foreach CERT,$(X509_CERTIFICATES-y), \ @@ -169,10 +171,11 @@ $(shell rm $(obj)/.x509.list) endif endif -kernel/modsign_certificate.o: $(obj)/x509_certificate_list +kernel/system_certificates.o: $(obj)/x509_certificate_list quiet_cmd_x509certs = CERTS $@ - cmd_x509certs = cat $(X509_CERTIFICATES) /dev/null >$@ + cmd_x509certs = cat $(X509_CERTIFICATES) /dev/null >$@ $(foreach X509,$(X509_CERTIFICATES),; echo " - Including cert $(X509)") + targets += $(obj)/x509_certificate_list $(obj)/x509_certificate_list: $(X509_CERTIFICATES) $(obj)/.x509.list $(call if_changed,x509certs) @@ -182,7 +185,9 @@ $(obj)/.x509.list: @echo $(X509_CERTIFICATES) >$@ clean-files := x509_certificate_list .x509.list +endif +ifeq ($(CONFIG_MODULE_SIG),y) ############################################################################### # # If module signing is requested, say by allyesconfig, but a key has not been diff --git a/kernel/modsign_certificate.S b/kernel/modsign_certificate.S deleted file mode 100644 index 6fe03c7ffe7..00000000000 --- a/kernel/modsign_certificate.S +++ /dev/null @@ -1,11 +0,0 @@ -#include - -#define GLOBAL(name) \ - .globl VMLINUX_SYMBOL(name); \ - VMLINUX_SYMBOL(name): - - .section ".init.data","aw" - -GLOBAL(modsign_certificate_list) - .incbin "kernel/x509_certificate_list" -GLOBAL(modsign_certificate_list_end) diff --git a/kernel/modsign_pubkey.c b/kernel/modsign_pubkey.c deleted file mode 100644 index 7cbd4507a7e..00000000000 --- a/kernel/modsign_pubkey.c +++ /dev/null @@ -1,104 +0,0 @@ -/* Public keys for module signature verification - * - * Copyright (C) 2012 Red Hat, Inc. All Rights Reserved. - * Written by David Howells (dhowells@redhat.com) - * - * This program is free software; you can redistribute it and/or - * modify it under the terms of the GNU General Public Licence - * as published by the Free Software Foundation; either version - * 2 of the Licence, or (at your option) any later version. - */ - -#include -#include -#include -#include -#include -#include "module-internal.h" - -struct key *modsign_keyring; - -extern __initconst const u8 modsign_certificate_list[]; -extern __initconst const u8 modsign_certificate_list_end[]; - -/* - * We need to make sure ccache doesn't cache the .o file as it doesn't notice - * if modsign.pub changes. - */ -static __initconst const char annoy_ccache[] = __TIME__ "foo"; - -/* - * Load the compiled-in keys - */ -static __init int module_verify_init(void) -{ - pr_notice("Initialise module verification\n"); - - modsign_keyring = keyring_alloc(".module_sign", - KUIDT_INIT(0), KGIDT_INIT(0), - current_cred(), - ((KEY_POS_ALL & ~KEY_POS_SETATTR) | - KEY_USR_VIEW | KEY_USR_READ), - KEY_ALLOC_NOT_IN_QUOTA, NULL); - if (IS_ERR(modsign_keyring)) - panic("Can't allocate module signing keyring\n"); - - return 0; -} - -/* - * Must be initialised before we try and load the keys into the keyring. - */ -device_initcall(module_verify_init); - -/* - * Load the compiled-in keys - */ -static __init int load_module_signing_keys(void) -{ - key_ref_t key; - const u8 *p, *end; - size_t plen; - - pr_notice("Loading module verification certificates\n"); - - end = modsign_certificate_list_end; - p = modsign_certificate_list; - while (p < end) { - /* Each cert begins with an ASN.1 SEQUENCE tag and must be more - * than 256 bytes in size. - */ - if (end - p < 4) - goto dodgy_cert; - if (p[0] != 0x30 && - p[1] != 0x82) - goto dodgy_cert; - plen = (p[2] << 8) | p[3]; - plen += 4; - if (plen > end - p) - goto dodgy_cert; - - key = key_create_or_update(make_key_ref(modsign_keyring, 1), - "asymmetric", - NULL, - p, - plen, - (KEY_POS_ALL & ~KEY_POS_SETATTR) | - KEY_USR_VIEW, - KEY_ALLOC_NOT_IN_QUOTA); - if (IS_ERR(key)) - pr_err("MODSIGN: Problem loading in-kernel X.509 certificate (%ld)\n", - PTR_ERR(key)); - else - pr_notice("MODSIGN: Loaded cert '%s'\n", - key_ref_to_ptr(key)->description); - p += plen; - } - - return 0; - -dodgy_cert: - pr_err("MODSIGN: Problem parsing in-kernel X.509 certificate list\n"); - return 0; -} -late_initcall(load_module_signing_keys); diff --git a/kernel/module-internal.h b/kernel/module-internal.h index 24f9247b7d0..915e123a430 100644 --- a/kernel/module-internal.h +++ b/kernel/module-internal.h @@ -9,6 +9,4 @@ * 2 of the Licence, or (at your option) any later version. */ -extern struct key *modsign_keyring; - extern int mod_verify_sig(const void *mod, unsigned long *_modlen); diff --git a/kernel/module_signing.c b/kernel/module_signing.c index ee476404167..0b6b870dc5e 100644 --- a/kernel/module_signing.c +++ b/kernel/module_signing.c @@ -14,6 +14,7 @@ #include #include #include +#include #include "module-internal.h" /* @@ -157,7 +158,7 @@ static struct key *request_asymmetric_key(const char *signer, size_t signer_len, pr_debug("Look up: \"%s\"\n", id); - key = keyring_search(make_key_ref(modsign_keyring, 1), + key = keyring_search(make_key_ref(system_trusted_keyring, 1), &key_type_asymmetric, id); if (IS_ERR(key)) pr_warn("Request for unknown module key '%s' err %ld\n", diff --git a/kernel/system_certificates.S b/kernel/system_certificates.S new file mode 100644 index 00000000000..552d47b2d46 --- /dev/null +++ b/kernel/system_certificates.S @@ -0,0 +1,12 @@ +#include +#include + +#define GLOBAL(name) \ + .globl VMLINUX_SYMBOL(name); \ + VMLINUX_SYMBOL(name): + + __INITRODATA + +GLOBAL(system_certificate_list) + .incbin "kernel/x509_certificate_list" +GLOBAL(system_certificate_list_end) diff --git a/kernel/system_keyring.c b/kernel/system_keyring.c new file mode 100644 index 00000000000..51c35141a13 --- /dev/null +++ b/kernel/system_keyring.c @@ -0,0 +1,103 @@ +/* System trusted keyring for trusted public keys + * + * Copyright (C) 2012 Red Hat, Inc. All Rights Reserved. + * Written by David Howells (dhowells@redhat.com) + * + * This program is free software; you can redistribute it and/or + * modify it under the terms of the GNU General Public Licence + * as published by the Free Software Foundation; either version + * 2 of the Licence, or (at your option) any later version. + */ + +#include +#include +#include +#include +#include +#include +#include +#include "module-internal.h" + +struct key *system_trusted_keyring; +EXPORT_SYMBOL_GPL(system_trusted_keyring); + +extern __initconst const u8 system_certificate_list[]; +extern __initconst const u8 system_certificate_list_end[]; + +/* + * Load the compiled-in keys + */ +static __init int system_trusted_keyring_init(void) +{ + pr_notice("Initialise system trusted keyring\n"); + + system_trusted_keyring = + keyring_alloc(".system_keyring", + KUIDT_INIT(0), KGIDT_INIT(0), current_cred(), + ((KEY_POS_ALL & ~KEY_POS_SETATTR) | + KEY_USR_VIEW | KEY_USR_READ), + KEY_ALLOC_NOT_IN_QUOTA, NULL); + if (IS_ERR(system_trusted_keyring)) + panic("Can't allocate system trusted keyring\n"); + + return 0; +} + +/* + * Must be initialised before we try and load the keys into the keyring. + */ +device_initcall(system_trusted_keyring_init); + +/* + * Load the compiled-in list of X.509 certificates. + */ +static __init int load_system_certificate_list(void) +{ + key_ref_t key; + const u8 *p, *end; + size_t plen; + + pr_notice("Loading compiled-in X.509 certificates\n"); + + end = system_certificate_list_end; + p = system_certificate_list; + while (p < end) { + /* Each cert begins with an ASN.1 SEQUENCE tag and must be more + * than 256 bytes in size. + */ + if (end - p < 4) + goto dodgy_cert; + if (p[0] != 0x30 && + p[1] != 0x82) + goto dodgy_cert; + plen = (p[2] << 8) | p[3]; + plen += 4; + if (plen > end - p) + goto dodgy_cert; + + key = key_create_or_update(make_key_ref(system_trusted_keyring, 1), + "asymmetric", + NULL, + p, + plen, + (KEY_POS_ALL & ~KEY_POS_SETATTR) | + KEY_USR_VIEW, + KEY_ALLOC_NOT_IN_QUOTA); + if (IS_ERR(key)) { + pr_err("Problem loading in-kernel X.509 certificate (%ld)\n", + PTR_ERR(key)); + } else { + pr_notice("Loaded X.509 cert '%s'\n", + key_ref_to_ptr(key)->description); + key_ref_put(key); + } + p += plen; + } + + return 0; + +dodgy_cert: + pr_err("Problem parsing in-kernel X.509 certificate list\n"); + return 0; +} +late_initcall(load_system_certificate_list); -- cgit v1.2.3-70-g09d2 From 008643b86c5f33c115c84ccdda1725cac3ad50ad Mon Sep 17 00:00:00 2001 From: David Howells Date: Fri, 30 Aug 2013 16:07:37 +0100 Subject: KEYS: Add a 'trusted' flag and a 'trusted only' flag Add KEY_FLAG_TRUSTED to indicate that a key either comes from a trusted source or had a cryptographic signature chain that led back to a trusted key the kernel already possessed. Add KEY_FLAGS_TRUSTED_ONLY to indicate that a keyring will only accept links to keys marked with KEY_FLAGS_TRUSTED. Signed-off-by: David Howells Reviewed-by: Kees Cook --- include/linux/key-type.h | 1 + include/linux/key.h | 3 +++ kernel/system_keyring.c | 4 +++- security/keys/key.c | 8 ++++++++ security/keys/keyring.c | 4 ++++ 5 files changed, 19 insertions(+), 1 deletion(-) (limited to 'kernel') diff --git a/include/linux/key-type.h b/include/linux/key-type.h index f58737bcb05..a74c3a84dfd 100644 --- a/include/linux/key-type.h +++ b/include/linux/key-type.h @@ -45,6 +45,7 @@ struct key_preparsed_payload { const void *data; /* Raw data */ size_t datalen; /* Raw datalen */ size_t quotalen; /* Quota length for proposed payload */ + bool trusted; /* True if key is trusted */ }; typedef int (*request_key_actor_t)(struct key_construction *key, diff --git a/include/linux/key.h b/include/linux/key.h index 010dbb618ac..80d677483e3 100644 --- a/include/linux/key.h +++ b/include/linux/key.h @@ -168,6 +168,8 @@ struct key { #define KEY_FLAG_NEGATIVE 5 /* set if key is negative */ #define KEY_FLAG_ROOT_CAN_CLEAR 6 /* set if key can be cleared by root without permission */ #define KEY_FLAG_INVALIDATED 7 /* set if key has been invalidated */ +#define KEY_FLAG_TRUSTED 8 /* set if key is trusted */ +#define KEY_FLAG_TRUSTED_ONLY 9 /* set if keyring only accepts links to trusted keys */ /* the key type and key description string * - the desc is used to match a key against search criteria @@ -218,6 +220,7 @@ extern struct key *key_alloc(struct key_type *type, #define KEY_ALLOC_IN_QUOTA 0x0000 /* add to quota, reject if would overrun */ #define KEY_ALLOC_QUOTA_OVERRUN 0x0001 /* add to quota, permit even if overrun */ #define KEY_ALLOC_NOT_IN_QUOTA 0x0002 /* not in quota */ +#define KEY_ALLOC_TRUSTED 0x0004 /* Key should be flagged as trusted */ extern void key_revoke(struct key *key); extern void key_invalidate(struct key *key); diff --git a/kernel/system_keyring.c b/kernel/system_keyring.c index 51c35141a13..5296721eca5 100644 --- a/kernel/system_keyring.c +++ b/kernel/system_keyring.c @@ -40,6 +40,7 @@ static __init int system_trusted_keyring_init(void) if (IS_ERR(system_trusted_keyring)) panic("Can't allocate system trusted keyring\n"); + set_bit(KEY_FLAG_TRUSTED_ONLY, &system_trusted_keyring->flags); return 0; } @@ -82,7 +83,8 @@ static __init int load_system_certificate_list(void) plen, (KEY_POS_ALL & ~KEY_POS_SETATTR) | KEY_USR_VIEW, - KEY_ALLOC_NOT_IN_QUOTA); + KEY_ALLOC_NOT_IN_QUOTA | + KEY_ALLOC_TRUSTED); if (IS_ERR(key)) { pr_err("Problem loading in-kernel X.509 certificate (%ld)\n", PTR_ERR(key)); diff --git a/security/keys/key.c b/security/keys/key.c index a819b5c7d4e..d331ea9ef38 100644 --- a/security/keys/key.c +++ b/security/keys/key.c @@ -300,6 +300,8 @@ struct key *key_alloc(struct key_type *type, const char *desc, if (!(flags & KEY_ALLOC_NOT_IN_QUOTA)) key->flags |= 1 << KEY_FLAG_IN_QUOTA; + if (flags & KEY_ALLOC_TRUSTED) + key->flags |= 1 << KEY_FLAG_TRUSTED; memset(&key->type_data, 0, sizeof(key->type_data)); @@ -813,6 +815,7 @@ key_ref_t key_create_or_update(key_ref_t keyring_ref, prep.data = payload; prep.datalen = plen; prep.quotalen = index_key.type->def_datalen; + prep.trusted = flags & KEY_ALLOC_TRUSTED; if (index_key.type->preparse) { ret = index_key.type->preparse(&prep); if (ret < 0) { @@ -827,6 +830,11 @@ key_ref_t key_create_or_update(key_ref_t keyring_ref, } index_key.desc_len = strlen(index_key.description); + key_ref = ERR_PTR(-EPERM); + if (!prep.trusted && test_bit(KEY_FLAG_TRUSTED_ONLY, &keyring->flags)) + goto error_free_prep; + flags |= prep.trusted ? KEY_ALLOC_TRUSTED : 0; + ret = __key_link_begin(keyring, &index_key, &edit); if (ret < 0) { key_ref = ERR_PTR(ret); diff --git a/security/keys/keyring.c b/security/keys/keyring.c index f7cdea22214..9b6f6e09b50 100644 --- a/security/keys/keyring.c +++ b/security/keys/keyring.c @@ -1183,6 +1183,10 @@ int key_link(struct key *keyring, struct key *key) key_check(keyring); key_check(key); + if (test_bit(KEY_FLAG_TRUSTED_ONLY, &keyring->flags) && + !test_bit(KEY_FLAG_TRUSTED, &key->flags)) + return -EPERM; + ret = __key_link_begin(keyring, &key->index_key, &edit); if (ret == 0) { kdebug("begun {%d,%d}", keyring->serial, atomic_read(&keyring->usage)); -- cgit v1.2.3-70-g09d2 From af34cb0c3d16b46d88b661692b885d1d998a8ecb Mon Sep 17 00:00:00 2001 From: Mimi Zohar Date: Tue, 20 Aug 2013 14:36:26 -0400 Subject: KEYS: Make the system 'trusted' keyring viewable by userspace Give the root user the ability to read the system keyring and put read permission on the trusted keys added during boot. The latter is actually more theoretical than real for the moment as asymmetric keys do not currently provide a read operation. Signed-off-by: Mimi Zohar Signed-off-by: David Howells --- kernel/system_keyring.c | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) (limited to 'kernel') diff --git a/kernel/system_keyring.c b/kernel/system_keyring.c index 5296721eca5..564dd93430a 100644 --- a/kernel/system_keyring.c +++ b/kernel/system_keyring.c @@ -35,7 +35,7 @@ static __init int system_trusted_keyring_init(void) keyring_alloc(".system_keyring", KUIDT_INIT(0), KGIDT_INIT(0), current_cred(), ((KEY_POS_ALL & ~KEY_POS_SETATTR) | - KEY_USR_VIEW | KEY_USR_READ), + KEY_USR_VIEW | KEY_USR_READ | KEY_USR_SEARCH), KEY_ALLOC_NOT_IN_QUOTA, NULL); if (IS_ERR(system_trusted_keyring)) panic("Can't allocate system trusted keyring\n"); @@ -81,8 +81,8 @@ static __init int load_system_certificate_list(void) NULL, p, plen, - (KEY_POS_ALL & ~KEY_POS_SETATTR) | - KEY_USR_VIEW, + ((KEY_POS_ALL & ~KEY_POS_SETATTR) | + KEY_USR_VIEW | KEY_USR_READ), KEY_ALLOC_NOT_IN_QUOTA | KEY_ALLOC_TRUSTED); if (IS_ERR(key)) { -- cgit v1.2.3-70-g09d2 From 3fe78ca2fb1d61ea598e63fcbf38aec76b36b3a8 Mon Sep 17 00:00:00 2001 From: Dmitry Kasatkin Date: Mon, 6 May 2013 15:58:15 +0300 Subject: keys: change asymmetric keys to use common hash definitions This patch makes use of the newly defined common hash algorithm info, replacing, for example, PKEY_HASH with HASH_ALGO. Changelog: - Lindent fixes - Mimi CC: David Howells Signed-off-by: Dmitry Kasatkin Signed-off-by: Mimi Zohar --- crypto/asymmetric_keys/Kconfig | 1 + crypto/asymmetric_keys/public_key.c | 12 ------------ crypto/asymmetric_keys/rsa.c | 14 +++++++------- crypto/asymmetric_keys/x509_cert_parser.c | 12 ++++++------ crypto/asymmetric_keys/x509_parser.h | 2 ++ crypto/asymmetric_keys/x509_public_key.c | 9 ++++----- include/crypto/public_key.h | 18 ++++-------------- kernel/module_signing.c | 8 ++++---- 8 files changed, 28 insertions(+), 48 deletions(-) (limited to 'kernel') diff --git a/crypto/asymmetric_keys/Kconfig b/crypto/asymmetric_keys/Kconfig index 862b01fe617..82e7d6b0c27 100644 --- a/crypto/asymmetric_keys/Kconfig +++ b/crypto/asymmetric_keys/Kconfig @@ -13,6 +13,7 @@ config ASYMMETRIC_PUBLIC_KEY_SUBTYPE tristate "Asymmetric public-key crypto algorithm subtype" select MPILIB select PUBLIC_KEY_ALGO_RSA + select CRYPTO_HASH_INFO help This option provides support for asymmetric public key type handling. If signature generation and/or verification are to be used, diff --git a/crypto/asymmetric_keys/public_key.c b/crypto/asymmetric_keys/public_key.c index 49ac8d848ed..97eb001960b 100644 --- a/crypto/asymmetric_keys/public_key.c +++ b/crypto/asymmetric_keys/public_key.c @@ -36,18 +36,6 @@ const struct public_key_algorithm *pkey_algo[PKEY_ALGO__LAST] = { }; EXPORT_SYMBOL_GPL(pkey_algo); -const char *const pkey_hash_algo_name[PKEY_HASH__LAST] = { - [PKEY_HASH_MD4] = "md4", - [PKEY_HASH_MD5] = "md5", - [PKEY_HASH_SHA1] = "sha1", - [PKEY_HASH_RIPE_MD_160] = "rmd160", - [PKEY_HASH_SHA256] = "sha256", - [PKEY_HASH_SHA384] = "sha384", - [PKEY_HASH_SHA512] = "sha512", - [PKEY_HASH_SHA224] = "sha224", -}; -EXPORT_SYMBOL_GPL(pkey_hash_algo_name); - const char *const pkey_id_type_name[PKEY_ID_TYPE__LAST] = { [PKEY_ID_PGP] = "PGP", [PKEY_ID_X509] = "X509", diff --git a/crypto/asymmetric_keys/rsa.c b/crypto/asymmetric_keys/rsa.c index 4a6a0696f8a..90a17f59ba2 100644 --- a/crypto/asymmetric_keys/rsa.c +++ b/crypto/asymmetric_keys/rsa.c @@ -73,13 +73,13 @@ static const struct { size_t size; } RSA_ASN1_templates[PKEY_HASH__LAST] = { #define _(X) { RSA_digest_info_##X, sizeof(RSA_digest_info_##X) } - [PKEY_HASH_MD5] = _(MD5), - [PKEY_HASH_SHA1] = _(SHA1), - [PKEY_HASH_RIPE_MD_160] = _(RIPE_MD_160), - [PKEY_HASH_SHA256] = _(SHA256), - [PKEY_HASH_SHA384] = _(SHA384), - [PKEY_HASH_SHA512] = _(SHA512), - [PKEY_HASH_SHA224] = _(SHA224), + [HASH_ALGO_MD5] = _(MD5), + [HASH_ALGO_SHA1] = _(SHA1), + [HASH_ALGO_RIPE_MD_160] = _(RIPE_MD_160), + [HASH_ALGO_SHA256] = _(SHA256), + [HASH_ALGO_SHA384] = _(SHA384), + [HASH_ALGO_SHA512] = _(SHA512), + [HASH_ALGO_SHA224] = _(SHA224), #undef _ }; diff --git a/crypto/asymmetric_keys/x509_cert_parser.c b/crypto/asymmetric_keys/x509_cert_parser.c index 144201ccba0..29893162497 100644 --- a/crypto/asymmetric_keys/x509_cert_parser.c +++ b/crypto/asymmetric_keys/x509_cert_parser.c @@ -154,32 +154,32 @@ int x509_note_pkey_algo(void *context, size_t hdrlen, return -ENOPKG; /* Unsupported combination */ case OID_md4WithRSAEncryption: - ctx->cert->sig.pkey_hash_algo = PKEY_HASH_MD5; + ctx->cert->sig.pkey_hash_algo = HASH_ALGO_MD5; ctx->cert->sig.pkey_algo = PKEY_ALGO_RSA; break; case OID_sha1WithRSAEncryption: - ctx->cert->sig.pkey_hash_algo = PKEY_HASH_SHA1; + ctx->cert->sig.pkey_hash_algo = HASH_ALGO_SHA1; ctx->cert->sig.pkey_algo = PKEY_ALGO_RSA; break; case OID_sha256WithRSAEncryption: - ctx->cert->sig.pkey_hash_algo = PKEY_HASH_SHA256; + ctx->cert->sig.pkey_hash_algo = HASH_ALGO_SHA256; ctx->cert->sig.pkey_algo = PKEY_ALGO_RSA; break; case OID_sha384WithRSAEncryption: - ctx->cert->sig.pkey_hash_algo = PKEY_HASH_SHA384; + ctx->cert->sig.pkey_hash_algo = HASH_ALGO_SHA384; ctx->cert->sig.pkey_algo = PKEY_ALGO_RSA; break; case OID_sha512WithRSAEncryption: - ctx->cert->sig.pkey_hash_algo = PKEY_HASH_SHA512; + ctx->cert->sig.pkey_hash_algo = HASH_ALGO_SHA512; ctx->cert->sig.pkey_algo = PKEY_ALGO_RSA; break; case OID_sha224WithRSAEncryption: - ctx->cert->sig.pkey_hash_algo = PKEY_HASH_SHA224; + ctx->cert->sig.pkey_hash_algo = HASH_ALGO_SHA224; ctx->cert->sig.pkey_algo = PKEY_ALGO_RSA; break; } diff --git a/crypto/asymmetric_keys/x509_parser.h b/crypto/asymmetric_keys/x509_parser.h index 87d9cc26f63..04c81bd0f3f 100644 --- a/crypto/asymmetric_keys/x509_parser.h +++ b/crypto/asymmetric_keys/x509_parser.h @@ -21,6 +21,8 @@ struct x509_certificate { char *authority; /* Authority key fingerprint as hex */ struct tm valid_from; struct tm valid_to; + enum pkey_algo pkey_algo : 8; /* Public key algorithm */ + enum hash_algo sig_hash_algo : 8; /* Signature hash algorithm */ const void *tbs; /* Signed data */ unsigned tbs_size; /* Size of signed data */ unsigned raw_sig_size; /* Size of sigature */ diff --git a/crypto/asymmetric_keys/x509_public_key.c b/crypto/asymmetric_keys/x509_public_key.c index 6abc27f2e8a..0a6bfad5491 100644 --- a/crypto/asymmetric_keys/x509_public_key.c +++ b/crypto/asymmetric_keys/x509_public_key.c @@ -96,7 +96,7 @@ int x509_get_sig_params(struct x509_certificate *cert) /* Allocate the hashing algorithm we're going to need and find out how * big the hash operational data will be. */ - tfm = crypto_alloc_shash(pkey_hash_algo_name[cert->sig.pkey_hash_algo], 0, 0); + tfm = crypto_alloc_shash(hash_algo_name[cert->sig.pkey_hash_algo], 0, 0); if (IS_ERR(tfm)) return (PTR_ERR(tfm) == -ENOENT) ? -ENOPKG : PTR_ERR(tfm); @@ -199,7 +199,7 @@ static int x509_key_preparse(struct key_preparsed_payload *prep) cert->sig.pkey_hash_algo >= PKEY_HASH__LAST || !pkey_algo[cert->pub->pkey_algo] || !pkey_algo[cert->sig.pkey_algo] || - !pkey_hash_algo_name[cert->sig.pkey_hash_algo]) { + !hash_algo_name[cert->sig.pkey_hash_algo]) { ret = -ENOPKG; goto error_free_cert; } @@ -213,9 +213,8 @@ static int x509_key_preparse(struct key_preparsed_payload *prep) cert->valid_to.tm_year + 1900, cert->valid_to.tm_mon + 1, cert->valid_to.tm_mday, cert->valid_to.tm_hour, cert->valid_to.tm_min, cert->valid_to.tm_sec); - pr_devel("Cert Signature: %s + %s\n", - pkey_algo_name[cert->sig.pkey_algo], - pkey_hash_algo_name[cert->sig.pkey_hash_algo]); + pr_devel("Cert Signature: %s\n", + hash_algo_name[cert->sig.pkey_hash_algo]); if (!cert->fingerprint) { pr_warn("Cert for '%s' must have a SubjKeyId extension\n", diff --git a/include/crypto/public_key.h b/include/crypto/public_key.h index b34fda4dcab..fc09732613a 100644 --- a/include/crypto/public_key.h +++ b/include/crypto/public_key.h @@ -15,6 +15,7 @@ #define _LINUX_PUBLIC_KEY_H #include +#include enum pkey_algo { PKEY_ALGO_DSA, @@ -25,19 +26,8 @@ enum pkey_algo { extern const char *const pkey_algo_name[PKEY_ALGO__LAST]; extern const struct public_key_algorithm *pkey_algo[PKEY_ALGO__LAST]; -enum pkey_hash_algo { - PKEY_HASH_MD4, - PKEY_HASH_MD5, - PKEY_HASH_SHA1, - PKEY_HASH_RIPE_MD_160, - PKEY_HASH_SHA256, - PKEY_HASH_SHA384, - PKEY_HASH_SHA512, - PKEY_HASH_SHA224, - PKEY_HASH__LAST -}; - -extern const char *const pkey_hash_algo_name[PKEY_HASH__LAST]; +/* asymmetric key implementation supports only up to SHA224 */ +#define PKEY_HASH__LAST (HASH_ALGO_SHA224 + 1) enum pkey_id_type { PKEY_ID_PGP, /* OpenPGP generated key ID */ @@ -91,7 +81,7 @@ struct public_key_signature { u8 digest_size; /* Number of bytes in digest */ u8 nr_mpi; /* Occupancy of mpi[] */ enum pkey_algo pkey_algo : 8; - enum pkey_hash_algo pkey_hash_algo : 8; + enum hash_algo pkey_hash_algo : 8; union { MPI mpi[2]; struct { diff --git a/kernel/module_signing.c b/kernel/module_signing.c index 0b6b870dc5e..be5b8fac4bd 100644 --- a/kernel/module_signing.c +++ b/kernel/module_signing.c @@ -29,7 +29,7 @@ */ struct module_signature { u8 algo; /* Public-key crypto algorithm [enum pkey_algo] */ - u8 hash; /* Digest algorithm [enum pkey_hash_algo] */ + u8 hash; /* Digest algorithm [enum hash_algo] */ u8 id_type; /* Key identifier type [enum pkey_id_type] */ u8 signer_len; /* Length of signer's name */ u8 key_id_len; /* Length of key identifier */ @@ -40,7 +40,7 @@ struct module_signature { /* * Digest the module contents. */ -static struct public_key_signature *mod_make_digest(enum pkey_hash_algo hash, +static struct public_key_signature *mod_make_digest(enum hash_algo hash, const void *mod, unsigned long modlen) { @@ -55,7 +55,7 @@ static struct public_key_signature *mod_make_digest(enum pkey_hash_algo hash, /* Allocate the hashing algorithm we're going to need and find out how * big the hash operational data will be. */ - tfm = crypto_alloc_shash(pkey_hash_algo_name[hash], 0, 0); + tfm = crypto_alloc_shash(hash_algo_name[hash], 0, 0); if (IS_ERR(tfm)) return (PTR_ERR(tfm) == -ENOENT) ? ERR_PTR(-ENOPKG) : ERR_CAST(tfm); @@ -218,7 +218,7 @@ int mod_verify_sig(const void *mod, unsigned long *_modlen) return -ENOPKG; if (ms.hash >= PKEY_HASH__LAST || - !pkey_hash_algo_name[ms.hash]) + !hash_algo_name[ms.hash]) return -ENOPKG; key = request_asymmetric_key(sig, ms.signer_len, -- cgit v1.2.3-70-g09d2 From 0b6b098efcddac2bf4e2a895c9b655560bbfcee4 Mon Sep 17 00:00:00 2001 From: Mathias Krause Date: Fri, 25 Oct 2013 12:14:15 +0200 Subject: padata: make the sequence counter an atomic_t Using a spinlock to atomically increase a counter sounds wrong -- we've atomic_t for this! Also move 'seq_nr' to a different cache line than 'lock' to reduce cache line trashing. This has the nice side effect of decreasing the size of struct parallel_data from 192 to 128 bytes for a x86-64 build, e.g. occupying only two instead of three cache lines. Those changes results in a 5% performance increase on an IPsec test run using pcrypt. Btw. the seq_lock spinlock was never explicitly initialized -- one more reason to get rid of it. Signed-off-by: Mathias Krause Acked-by: Steffen Klassert Signed-off-by: Herbert Xu --- include/linux/padata.h | 3 +-- kernel/padata.c | 9 ++++----- 2 files changed, 5 insertions(+), 7 deletions(-) (limited to 'kernel') diff --git a/include/linux/padata.h b/include/linux/padata.h index 86292beebfe..43869465047 100644 --- a/include/linux/padata.h +++ b/include/linux/padata.h @@ -129,10 +129,9 @@ struct parallel_data { struct padata_serial_queue __percpu *squeue; atomic_t reorder_objects; atomic_t refcnt; + atomic_t seq_nr; struct padata_cpumask cpumask; spinlock_t lock ____cacheline_aligned; - spinlock_t seq_lock; - unsigned int seq_nr; unsigned int processed; struct timer_list timer; }; diff --git a/kernel/padata.c b/kernel/padata.c index 07af2c95dcf..2abd25d79cc 100644 --- a/kernel/padata.c +++ b/kernel/padata.c @@ -46,6 +46,7 @@ static int padata_index_to_cpu(struct parallel_data *pd, int cpu_index) static int padata_cpu_hash(struct parallel_data *pd) { + unsigned int seq_nr; int cpu_index; /* @@ -53,10 +54,8 @@ static int padata_cpu_hash(struct parallel_data *pd) * seq_nr mod. number of cpus in use. */ - spin_lock(&pd->seq_lock); - cpu_index = pd->seq_nr % cpumask_weight(pd->cpumask.pcpu); - pd->seq_nr++; - spin_unlock(&pd->seq_lock); + seq_nr = atomic_inc_return(&pd->seq_nr); + cpu_index = seq_nr % cpumask_weight(pd->cpumask.pcpu); return padata_index_to_cpu(pd, cpu_index); } @@ -429,7 +428,7 @@ static struct parallel_data *padata_alloc_pd(struct padata_instance *pinst, padata_init_pqueues(pd); padata_init_squeues(pd); setup_timer(&pd->timer, padata_reorder_timer, (unsigned long)pd); - pd->seq_nr = 0; + atomic_set(&pd->seq_nr, -1); atomic_set(&pd->reorder_objects, 0); atomic_set(&pd->refcnt, 0); pd->pinst = pinst; -- cgit v1.2.3-70-g09d2 From 6ef4d2eaf5a46d4ab6db02612b5e883b834017b8 Mon Sep 17 00:00:00 2001 From: Chen Gang Date: Wed, 30 Oct 2013 09:11:28 +0800 Subject: kernel/system_certificate.S: use real contents instead of macro GLOBAL() If a macro is only used within 2 times, and also its contents are within 2 lines, recommend to expand it to shrink code line. For our case, the macro is not portable either: some architectures' assembler may use another character to mark newline in a macro (e.g. '`' for arc), which will cause issue. If still want to use macro and let it portable enough, it will also need include additional header file (e.g "#include ", although it also need be fixed). Signed-off-by: Chen Gang Signed-off-by: David Howells --- kernel/system_certificates.S | 10 ++++------ 1 file changed, 4 insertions(+), 6 deletions(-) (limited to 'kernel') diff --git a/kernel/system_certificates.S b/kernel/system_certificates.S index 552d47b2d46..4aef390671c 100644 --- a/kernel/system_certificates.S +++ b/kernel/system_certificates.S @@ -1,12 +1,10 @@ #include #include -#define GLOBAL(name) \ - .globl VMLINUX_SYMBOL(name); \ - VMLINUX_SYMBOL(name): - __INITRODATA -GLOBAL(system_certificate_list) + .globl VMLINUX_SYMBOL(system_certificate_list) +VMLINUX_SYMBOL(system_certificate_list): .incbin "kernel/x509_certificate_list" -GLOBAL(system_certificate_list_end) + .globl VMLINUX_SYMBOL(system_certificate_list_end) +VMLINUX_SYMBOL(system_certificate_list_end): -- cgit v1.2.3-70-g09d2 From b50eba7e2d534762a19a7207dda012f09302a8d2 Mon Sep 17 00:00:00 2001 From: Richard Guy Briggs Date: Mon, 16 Sep 2013 18:20:42 -0400 Subject: audit: format user messages to size of MAX_AUDIT_MESSAGE_LENGTH Messages of type AUDIT_USER_TTY were being formatted to 1024 octets, truncating messages approaching MAX_AUDIT_MESSAGE_LENGTH (8970 octets). Set the formatting to 8560 characters, given maximum estimates for prefix and suffix budgets. See the problem discussion: https://www.redhat.com/archives/linux-audit/2009-January/msg00030.html And the new size rationale: https://www.redhat.com/archives/linux-audit/2013-September/msg00016.html Test ~8k messages with: auditctl -m "$(for i in $(seq -w 001 820);do echo -n "${i}0______";done)" Reported-by: LC Bruzenak Reported-by: Justin Stephenson Signed-off-by: Richard Guy Briggs Signed-off-by: Eric Paris --- include/uapi/linux/audit.h | 6 ++++++ kernel/audit.c | 3 ++- 2 files changed, 8 insertions(+), 1 deletion(-) (limited to 'kernel') diff --git a/include/uapi/linux/audit.h b/include/uapi/linux/audit.h index 75cef3fd97a..5dfcd85037e 100644 --- a/include/uapi/linux/audit.h +++ b/include/uapi/linux/audit.h @@ -358,6 +358,12 @@ enum { #define AUDIT_PERM_READ 4 #define AUDIT_PERM_ATTR 8 +/* MAX_AUDIT_MESSAGE_LENGTH is set in audit:lib/libaudit.h as: + * 8970 // PATH_MAX*2+CONTEXT_SIZE*2+11+256+1 + * max header+body+tailer: 44 + 29 + 32 + 262 + 7 + pad + */ +#define AUDIT_MESSAGE_TEXT_MAX 8560 + struct audit_status { __u32 mask; /* Bit mask for valid entries */ __u32 enabled; /* 1 = enabled, 0 = disabled */ diff --git a/kernel/audit.c b/kernel/audit.c index 91e53d04b6a..dd63d2f978d 100644 --- a/kernel/audit.c +++ b/kernel/audit.c @@ -715,7 +715,8 @@ static int audit_receive_msg(struct sk_buff *skb, struct nlmsghdr *nlh) } audit_log_common_recv_msg(&ab, msg_type); if (msg_type != AUDIT_USER_TTY) - audit_log_format(ab, " msg='%.1024s'", + audit_log_format(ab, " msg='%.*s'", + AUDIT_MESSAGE_TEXT_MAX, (char *)data); else { int size; -- cgit v1.2.3-70-g09d2 From 47145705e3e2894b6c73846b67734726c5d7173c Mon Sep 17 00:00:00 2001 From: "Ilya V. Matveychikov" Date: Sun, 29 Sep 2013 15:53:40 +0400 Subject: audit: remove duplicate inclusion of the netlink header Signed-off-by: Ilya V. Matveychikov Signed-off-by: Richard Guy Briggs Signed-off-by: Eric Paris --- kernel/audit.c | 1 - 1 file changed, 1 deletion(-) (limited to 'kernel') diff --git a/kernel/audit.c b/kernel/audit.c index dd63d2f978d..97c1ab94a96 100644 --- a/kernel/audit.c +++ b/kernel/audit.c @@ -60,7 +60,6 @@ #ifdef CONFIG_SECURITY #include #endif -#include #include #include #include -- cgit v1.2.3-70-g09d2 From b8f89caafeb55fba75b74bea25adc4e4cd91be67 Mon Sep 17 00:00:00 2001 From: Richard Guy Briggs Date: Wed, 18 Sep 2013 11:17:43 -0400 Subject: audit: remove newline accidentally added during session id helper refactor A newline was accidentally added during session ID helper refactorization in commit 4d3fb709. This needlessly uses up buffer space, messes up syslog formatting and makes userspace processing less efficient. Remove it. Signed-off-by: Richard Guy Briggs Acked-by: Eric Paris Signed-off-by: Eric Paris --- kernel/audit.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'kernel') diff --git a/kernel/audit.c b/kernel/audit.c index 97c1ab94a96..90699650ee4 100644 --- a/kernel/audit.c +++ b/kernel/audit.c @@ -1388,7 +1388,7 @@ void audit_log_session_info(struct audit_buffer *ab) u32 sessionid = audit_get_sessionid(current); uid_t auid = from_kuid(&init_user_ns, audit_get_loginuid(current)); - audit_log_format(ab, " auid=%u ses=%u\n", auid, sessionid); + audit_log_format(ab, " auid=%u ses=%u", auid, sessionid); } void audit_log_key(struct audit_buffer *ab, char *key) -- cgit v1.2.3-70-g09d2 From af0e493d304262162dcc0e0b39ee47b12461d003 Mon Sep 17 00:00:00 2001 From: Gao feng Date: Mon, 23 Sep 2013 15:55:44 +0800 Subject: Audit: remove duplicate comments Remove it. Signed-off-by: Gao feng Signed-off-by: Richard Guy Briggs Signed-off-by: Eric Paris --- kernel/audit.c | 7 ------- 1 file changed, 7 deletions(-) (limited to 'kernel') diff --git a/kernel/audit.c b/kernel/audit.c index 90699650ee4..02b1b7875c9 100644 --- a/kernel/audit.c +++ b/kernel/audit.c @@ -1067,13 +1067,6 @@ static void wait_for_auditd(unsigned long sleep_time) remove_wait_queue(&audit_backlog_wait, &wait); } -/* Obtain an audit buffer. This routine does locking to obtain the - * audit buffer, but then no locking is required for calls to - * audit_log_*format. If the tsk is a task that is currently in a - * syscall, then the syscall is marked as auditable and an audit record - * will be written at syscall exit. If there is no associated task, tsk - * should be NULL. */ - /** * audit_log_start - obtain an audit buffer * @ctx: audit_context (may be NULL) -- cgit v1.2.3-70-g09d2 From d48d805122e39c066898df2e460875d3aaf60508 Mon Sep 17 00:00:00 2001 From: Oleg Nesterov Date: Sun, 15 Sep 2013 19:11:09 +0200 Subject: audit_alloc: clear TIF_SYSCALL_AUDIT if !audit_context If audit_filter_task() nacks the new thread it makes sense to clear TIF_SYSCALL_AUDIT which can be copied from parent by dup_task_struct(). A wrong TIF_SYSCALL_AUDIT is not really bad but it triggers the "slow" audit paths in entry.S to ensure the task can not miss audit_syscall_*() calls, this is pointless if the task has no ->audit_context. Signed-off-by: Oleg Nesterov Acked-by: Steve Grubb Acked-by: Eric Paris Signed-off-by: Richard Guy Briggs Signed-off-by: Eric Paris --- kernel/auditsc.c | 4 +++- 1 file changed, 3 insertions(+), 1 deletion(-) (limited to 'kernel') diff --git a/kernel/auditsc.c b/kernel/auditsc.c index 9845cb32b60..95293abb877 100644 --- a/kernel/auditsc.c +++ b/kernel/auditsc.c @@ -943,8 +943,10 @@ int audit_alloc(struct task_struct *tsk) return 0; /* Return if not auditing. */ state = audit_filter_task(tsk, &key); - if (state == AUDIT_DISABLED) + if (state == AUDIT_DISABLED) { + clear_tsk_thread_flag(tsk, TIF_SYSCALL_AUDIT); return 0; + } if (!(context = audit_alloc_context(state))) { kfree(key); -- cgit v1.2.3-70-g09d2 From 0868a5e150bc4c47e7a003367cd755811eb41e0b Mon Sep 17 00:00:00 2001 From: Tyler Hicks Date: Thu, 25 Jul 2013 18:02:55 -0700 Subject: audit: printk USER_AVC messages when audit isn't enabled When the audit=1 kernel parameter is absent and auditd is not running, AUDIT_USER_AVC messages are being silently discarded. AUDIT_USER_AVC messages should be sent to userspace using printk(), as mentioned in the commit message of 4a4cd633 ("AUDIT: Optimise the audit-disabled case for discarding user messages"). When audit_enabled is 0, audit_receive_msg() discards all user messages except for AUDIT_USER_AVC messages. However, audit_log_common_recv_msg() refuses to allocate an audit_buffer if audit_enabled is 0. The fix is to special case AUDIT_USER_AVC messages in both functions. It looks like commit 50397bd1 ("[AUDIT] clean up audit_receive_msg()") introduced this bug. Cc: # v2.6.25+ Signed-off-by: Tyler Hicks Cc: Al Viro Cc: Eric Paris Cc: linux-audit@redhat.com Acked-by: Kees Cook Signed-off-by: Richard Guy Briggs Signed-off-by: Eric Paris --- kernel/audit.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'kernel') diff --git a/kernel/audit.c b/kernel/audit.c index 02b1b7875c9..74550ff3644 100644 --- a/kernel/audit.c +++ b/kernel/audit.c @@ -612,7 +612,7 @@ static int audit_log_common_recv_msg(struct audit_buffer **ab, u16 msg_type) int rc = 0; uid_t uid = from_kuid(&init_user_ns, current_uid()); - if (!audit_enabled) { + if (!audit_enabled && msg_type != AUDIT_USER_AVC) { *ab = NULL; return rc; } -- cgit v1.2.3-70-g09d2 From 42f74461a5b60cf6b42887e6d2ff5b7be4abf1ca Mon Sep 17 00:00:00 2001 From: Richard Guy Briggs Date: Mon, 20 May 2013 15:08:18 -0400 Subject: audit: change decimal constant to macro for invalid uid SFR reported this 2013-05-15: > After merging the final tree, today's linux-next build (i386 defconfig) > produced this warning: > > kernel/auditfilter.c: In function 'audit_data_to_entry': > kernel/auditfilter.c:426:3: warning: this decimal constant is unsigned only > in ISO C90 [enabled by default] > > Introduced by commit 780a7654cee8 ("audit: Make testing for a valid > loginuid explicit") from Linus' tree. Replace this decimal constant in the code with a macro to make it more readable (add to the unsigned cast to quiet the warning). Cc: Stephen Rothwell Cc: "Eric W. Biederman" Signed-off-by: Richard Guy Briggs Signed-off-by: Eric Paris --- include/uapi/linux/audit.h | 2 ++ kernel/auditfilter.c | 2 +- 2 files changed, 3 insertions(+), 1 deletion(-) (limited to 'kernel') diff --git a/include/uapi/linux/audit.h b/include/uapi/linux/audit.h index 5dfcd85037e..c1f0fced3ed 100644 --- a/include/uapi/linux/audit.h +++ b/include/uapi/linux/audit.h @@ -380,6 +380,8 @@ struct audit_tty_status { __u32 log_passwd; /* 1 = enabled, 0 = disabled */ }; +#define AUDIT_UID_UNSET (unsigned int)-1 + /* audit_rule_data supports filter rules with both integer and string * fields. It corresponds with AUDIT_ADD_RULE, AUDIT_DEL_RULE and * AUDIT_LIST_RULES requests. diff --git a/kernel/auditfilter.c b/kernel/auditfilter.c index f7aee8be7fb..8a344cebd8b 100644 --- a/kernel/auditfilter.c +++ b/kernel/auditfilter.c @@ -423,7 +423,7 @@ static struct audit_entry *audit_data_to_entry(struct audit_rule_data *data, f->lsm_rule = NULL; /* Support legacy tests for a valid loginuid */ - if ((f->type == AUDIT_LOGINUID) && (f->val == ~0U)) { + if ((f->type == AUDIT_LOGINUID) && (f->val == AUDIT_UID_UNSET)) { f->type = AUDIT_LOGINUID_SET; f->val = 0; } -- cgit v1.2.3-70-g09d2 From b0fed40214ce79ef70d97584ebdf13f89786da0e Mon Sep 17 00:00:00 2001 From: Eric Paris Date: Wed, 22 May 2013 12:54:49 -0400 Subject: audit: implement generic feature setting and retrieving The audit_status structure was not designed with extensibility in mind. Define a new AUDIT_SET_FEATURE message type which takes a new structure of bits where things can be enabled/disabled/locked one at a time. This structure should be able to grow in the future while maintaining forward and backward compatibility (based loosly on the ideas from capabilities and prctl) This does not actually add any features, but is just infrastructure to allow new on/off types of audit system features. Signed-off-by: Eric Paris Signed-off-by: Richard Guy Briggs Signed-off-by: Eric Paris --- include/linux/audit.h | 2 + include/uapi/linux/audit.h | 16 +++++++ kernel/audit.c | 109 +++++++++++++++++++++++++++++++++++++++++++++ 3 files changed, 127 insertions(+) (limited to 'kernel') diff --git a/include/linux/audit.h b/include/linux/audit.h index 729a4d165bc..7b31bec9bcc 100644 --- a/include/linux/audit.h +++ b/include/linux/audit.h @@ -73,6 +73,8 @@ struct audit_field { void *lsm_rule; }; +extern int is_audit_feature_set(int which); + extern int __init audit_register_class(int class, unsigned *list); extern int audit_classify_syscall(int abi, unsigned syscall); extern int audit_classify_arch(int arch); diff --git a/include/uapi/linux/audit.h b/include/uapi/linux/audit.h index c1f0fced3ed..9eddf2ca614 100644 --- a/include/uapi/linux/audit.h +++ b/include/uapi/linux/audit.h @@ -68,6 +68,9 @@ #define AUDIT_MAKE_EQUIV 1015 /* Append to watched tree */ #define AUDIT_TTY_GET 1016 /* Get TTY auditing status */ #define AUDIT_TTY_SET 1017 /* Set TTY auditing status */ +#define AUDIT_SET_FEATURE 1018 /* Turn an audit feature on or off */ +#define AUDIT_GET_FEATURE 1019 /* Get which features are enabled */ +#define AUDIT_FEATURE_CHANGE 1020 /* audit log listing feature changes */ #define AUDIT_FIRST_USER_MSG 1100 /* Userspace messages mostly uninteresting to kernel */ #define AUDIT_USER_AVC 1107 /* We filter this differently */ @@ -375,6 +378,19 @@ struct audit_status { __u32 backlog; /* messages waiting in queue */ }; +struct audit_features { +#define AUDIT_FEATURE_VERSION 1 + __u32 vers; + __u32 mask; /* which bits we are dealing with */ + __u32 features; /* which feature to enable/disable */ + __u32 lock; /* which features to lock */ +}; + +#define AUDIT_LAST_FEATURE -1 + +#define audit_feature_valid(x) ((x) >= 0 && (x) <= AUDIT_LAST_FEATURE) +#define AUDIT_FEATURE_TO_MASK(x) (1 << ((x) & 31)) /* mask for __u32 */ + struct audit_tty_status { __u32 enabled; /* 1 = enabled, 0 = disabled */ __u32 log_passwd; /* 1 = enabled, 0 = disabled */ diff --git a/kernel/audit.c b/kernel/audit.c index 74550ff3644..29ee6a421c6 100644 --- a/kernel/audit.c +++ b/kernel/audit.c @@ -139,6 +139,15 @@ static struct task_struct *kauditd_task; static DECLARE_WAIT_QUEUE_HEAD(kauditd_wait); static DECLARE_WAIT_QUEUE_HEAD(audit_backlog_wait); +static struct audit_features af = {.vers = AUDIT_FEATURE_VERSION, + .mask = -1, + .features = 0, + .lock = 0,}; + +static char *audit_feature_names[0] = { +}; + + /* Serialize requests from userspace. */ DEFINE_MUTEX(audit_cmd_mutex); @@ -583,6 +592,8 @@ static int audit_netlink_ok(struct sk_buff *skb, u16 msg_type) return -EOPNOTSUPP; case AUDIT_GET: case AUDIT_SET: + case AUDIT_GET_FEATURE: + case AUDIT_SET_FEATURE: case AUDIT_LIST_RULES: case AUDIT_ADD_RULE: case AUDIT_DEL_RULE: @@ -627,6 +638,94 @@ static int audit_log_common_recv_msg(struct audit_buffer **ab, u16 msg_type) return rc; } +int is_audit_feature_set(int i) +{ + return af.features & AUDIT_FEATURE_TO_MASK(i); +} + + +static int audit_get_feature(struct sk_buff *skb) +{ + u32 seq; + + seq = nlmsg_hdr(skb)->nlmsg_seq; + + audit_send_reply(NETLINK_CB(skb).portid, seq, AUDIT_GET, 0, 0, + &af, sizeof(af)); + + return 0; +} + +static void audit_log_feature_change(int which, u32 old_feature, u32 new_feature, + u32 old_lock, u32 new_lock, int res) +{ + struct audit_buffer *ab; + + ab = audit_log_start(NULL, GFP_KERNEL, AUDIT_FEATURE_CHANGE); + audit_log_format(ab, "feature=%s new=%d old=%d old_lock=%d new_lock=%d res=%d", + audit_feature_names[which], !!old_feature, !!new_feature, + !!old_lock, !!new_lock, res); + audit_log_end(ab); +} + +static int audit_set_feature(struct sk_buff *skb) +{ + struct audit_features *uaf; + int i; + + BUILD_BUG_ON(AUDIT_LAST_FEATURE + 1 > sizeof(audit_feature_names)/sizeof(audit_feature_names[0])); + uaf = nlmsg_data(nlmsg_hdr(skb)); + + /* if there is ever a version 2 we should handle that here */ + + for (i = 0; i <= AUDIT_LAST_FEATURE; i++) { + u32 feature = AUDIT_FEATURE_TO_MASK(i); + u32 old_feature, new_feature, old_lock, new_lock; + + /* if we are not changing this feature, move along */ + if (!(feature & uaf->mask)) + continue; + + old_feature = af.features & feature; + new_feature = uaf->features & feature; + new_lock = (uaf->lock | af.lock) & feature; + old_lock = af.lock & feature; + + /* are we changing a locked feature? */ + if ((af.lock & feature) && (new_feature != old_feature)) { + audit_log_feature_change(i, old_feature, new_feature, + old_lock, new_lock, 0); + return -EPERM; + } + } + /* nothing invalid, do the changes */ + for (i = 0; i <= AUDIT_LAST_FEATURE; i++) { + u32 feature = AUDIT_FEATURE_TO_MASK(i); + u32 old_feature, new_feature, old_lock, new_lock; + + /* if we are not changing this feature, move along */ + if (!(feature & uaf->mask)) + continue; + + old_feature = af.features & feature; + new_feature = uaf->features & feature; + old_lock = af.lock & feature; + new_lock = (uaf->lock | af.lock) & feature; + + if (new_feature != old_feature) + audit_log_feature_change(i, old_feature, new_feature, + old_lock, new_lock, 1); + + if (new_feature) + af.features |= feature; + else + af.features &= ~feature; + af.lock |= new_lock; + } + + return 0; +} + static int audit_receive_msg(struct sk_buff *skb, struct nlmsghdr *nlh) { u32 seq; @@ -698,6 +797,16 @@ static int audit_receive_msg(struct sk_buff *skb, struct nlmsghdr *nlh) if (status_get->mask & AUDIT_STATUS_BACKLOG_LIMIT) err = audit_set_backlog_limit(status_get->backlog_limit); break; + case AUDIT_GET_FEATURE: + err = audit_get_feature(skb); + if (err) + return err; + break; + case AUDIT_SET_FEATURE: + err = audit_set_feature(skb); + if (err) + return err; + break; case AUDIT_USER: case AUDIT_FIRST_USER_MSG ... AUDIT_LAST_USER_MSG: case AUDIT_FIRST_USER_MSG2 ... AUDIT_LAST_USER_MSG2: -- cgit v1.2.3-70-g09d2 From da0a610497ce193782c8df4a33fee7fce030cb99 Mon Sep 17 00:00:00 2001 From: Eric Paris Date: Fri, 24 May 2013 08:58:31 -0400 Subject: audit: loginuid functions coding style This is just a code rework. It makes things more readable. It does not make any functional changes. It does change the log messages to include both the old session id as well the new and it includes a new res field, which means we get messages even when the user did not have permission to change the loginuid. Signed-off-by: Eric Paris Signed-off-by: Richard Guy Briggs Signed-off-by: Eric Paris --- kernel/auditsc.c | 70 ++++++++++++++++++++++++++++++++++++-------------------- 1 file changed, 45 insertions(+), 25 deletions(-) (limited to 'kernel') diff --git a/kernel/auditsc.c b/kernel/auditsc.c index 95293abb877..72684679e8b 100644 --- a/kernel/auditsc.c +++ b/kernel/auditsc.c @@ -1966,6 +1966,39 @@ int auditsc_get_stamp(struct audit_context *ctx, /* global counter which is incremented every time something logs in */ static atomic_t session_id = ATOMIC_INIT(0); +static int audit_set_loginuid_perm(kuid_t loginuid) +{ +#ifdef CONFIG_AUDIT_LOGINUID_IMMUTABLE + /* if we are unset, we don't need privs */ + if (!audit_loginuid_set(current)) + return 0; +#else /* CONFIG_AUDIT_LOGINUID_IMMUTABLE */ + if (capable(CAP_AUDIT_CONTROL)) + return 0; +#endif /* CONFIG_AUDIT_LOGINUID_IMMUTABLE */ + return -EPERM; +} + +static void audit_log_set_loginuid(kuid_t koldloginuid, kuid_t kloginuid, + unsigned int oldsessionid, unsigned int sessionid, + int rc) +{ + struct audit_buffer *ab; + uid_t uid, ologinuid, nloginuid; + + uid = from_kuid(&init_user_ns, task_uid(current)); + ologinuid = from_kuid(&init_user_ns, koldloginuid); + nloginuid = from_kuid(&init_user_ns, kloginuid), + + ab = audit_log_start(NULL, GFP_KERNEL, AUDIT_LOGIN); + if (!ab) + return; + audit_log_format(ab, "pid=%d uid=%u old auid=%u new auid=%u old " + "ses=%u new ses=%u res=%d", current->pid, uid, ologinuid, + nloginuid, oldsessionid, sessionid, !rc); + audit_log_end(ab); +} + /** * audit_set_loginuid - set current task's audit_context loginuid * @loginuid: loginuid value @@ -1977,37 +2010,24 @@ static atomic_t session_id = ATOMIC_INIT(0); int audit_set_loginuid(kuid_t loginuid) { struct task_struct *task = current; - struct audit_context *context = task->audit_context; - unsigned int sessionid; + unsigned int sessionid = -1; + kuid_t oldloginuid, oldsessionid; + int rc; -#ifdef CONFIG_AUDIT_LOGINUID_IMMUTABLE - if (audit_loginuid_set(task)) - return -EPERM; -#else /* CONFIG_AUDIT_LOGINUID_IMMUTABLE */ - if (!capable(CAP_AUDIT_CONTROL)) - return -EPERM; -#endif /* CONFIG_AUDIT_LOGINUID_IMMUTABLE */ + oldloginuid = audit_get_loginuid(current); + oldsessionid = audit_get_sessionid(current); + + rc = audit_set_loginuid_perm(loginuid); + if (rc) + goto out; sessionid = atomic_inc_return(&session_id); - if (context && context->in_syscall) { - struct audit_buffer *ab; - ab = audit_log_start(NULL, GFP_KERNEL, AUDIT_LOGIN); - if (ab) { - audit_log_format(ab, "login pid=%d uid=%u " - "old auid=%u new auid=%u" - " old ses=%u new ses=%u", - task->pid, - from_kuid(&init_user_ns, task_uid(task)), - from_kuid(&init_user_ns, task->loginuid), - from_kuid(&init_user_ns, loginuid), - task->sessionid, sessionid); - audit_log_end(ab); - } - } task->sessionid = sessionid; task->loginuid = loginuid; - return 0; +out: + audit_log_set_loginuid(oldloginuid, loginuid, oldsessionid, sessionid, rc); + return rc; } /** -- cgit v1.2.3-70-g09d2 From 83fa6bbe4c4541ae748b550b4ec391f8a0acfe94 Mon Sep 17 00:00:00 2001 From: Eric Paris Date: Fri, 24 May 2013 09:39:29 -0400 Subject: audit: remove CONFIG_AUDIT_LOGINUID_IMMUTABLE After trying to use this feature in Fedora we found the hard coding policy like this into the kernel was a bad idea. Surprise surprise. We ran into these problems because it was impossible to launch a container as a logged in user and run a login daemon inside that container. This reverts back to the old behavior before this option was added. The option will be re-added in a userspace selectable manor such that userspace can choose when it is and when it is not appropriate. Signed-off-by: Eric Paris Signed-off-by: Richard Guy Briggs Signed-off-by: Eric Paris --- init/Kconfig | 14 -------------- kernel/auditsc.c | 10 ++++------ 2 files changed, 4 insertions(+), 20 deletions(-) (limited to 'kernel') diff --git a/init/Kconfig b/init/Kconfig index fed81b576f2..18a98c893d0 100644 --- a/init/Kconfig +++ b/init/Kconfig @@ -301,20 +301,6 @@ config AUDIT_TREE depends on AUDITSYSCALL select FSNOTIFY -config AUDIT_LOGINUID_IMMUTABLE - bool "Make audit loginuid immutable" - depends on AUDIT - help - The config option toggles if a task setting its loginuid requires - CAP_SYS_AUDITCONTROL or if that task should require no special permissions - but should instead only allow setting its loginuid if it was never - previously set. On systems which use systemd or a similar central - process to restart login services this should be set to true. On older - systems in which an admin would typically have to directly stop and - start processes this should be set to false. Setting this to true allows - one to drop potentially dangerous capabilites from the login tasks, - but may not be backwards compatible with older init systems. - source "kernel/irq/Kconfig" source "kernel/time/Kconfig" diff --git a/kernel/auditsc.c b/kernel/auditsc.c index 72684679e8b..b55788bf160 100644 --- a/kernel/auditsc.c +++ b/kernel/auditsc.c @@ -1968,15 +1968,13 @@ static atomic_t session_id = ATOMIC_INIT(0); static int audit_set_loginuid_perm(kuid_t loginuid) { -#ifdef CONFIG_AUDIT_LOGINUID_IMMUTABLE /* if we are unset, we don't need privs */ if (!audit_loginuid_set(current)) return 0; -#else /* CONFIG_AUDIT_LOGINUID_IMMUTABLE */ - if (capable(CAP_AUDIT_CONTROL)) - return 0; -#endif /* CONFIG_AUDIT_LOGINUID_IMMUTABLE */ - return -EPERM; + /* it is set, you need permission */ + if (!capable(CAP_AUDIT_CONTROL)) + return -EPERM; + return 0; } static void audit_log_set_loginuid(kuid_t koldloginuid, kuid_t kloginuid, -- cgit v1.2.3-70-g09d2 From 81407c84ace88368ff23abb81caaeacf050c8450 Mon Sep 17 00:00:00 2001 From: Eric Paris Date: Fri, 24 May 2013 09:49:14 -0400 Subject: audit: allow unsetting the loginuid (with priv) If a task has CAP_AUDIT_CONTROL allow that task to unset their loginuid. This would allow a child of that task to set their loginuid without CAP_AUDIT_CONTROL. Thus when launching a new login daemon, a priviledged helper would be able to unset the loginuid and then the daemon, which may be malicious user facing, do not need priv to function correctly. Signed-off-by: Eric Paris Signed-off-by: Richard Guy Briggs Signed-off-by: Eric Paris --- fs/proc/base.c | 14 ++++++++++---- kernel/auditsc.c | 4 +++- 2 files changed, 13 insertions(+), 5 deletions(-) (limited to 'kernel') diff --git a/fs/proc/base.c b/fs/proc/base.c index 1485e38daaa..03c8d747be4 100644 --- a/fs/proc/base.c +++ b/fs/proc/base.c @@ -1151,10 +1151,16 @@ static ssize_t proc_loginuid_write(struct file * file, const char __user * buf, goto out_free_page; } - kloginuid = make_kuid(file->f_cred->user_ns, loginuid); - if (!uid_valid(kloginuid)) { - length = -EINVAL; - goto out_free_page; + + /* is userspace tring to explicitly UNSET the loginuid? */ + if (loginuid == AUDIT_UID_UNSET) { + kloginuid = INVALID_UID; + } else { + kloginuid = make_kuid(file->f_cred->user_ns, loginuid); + if (!uid_valid(kloginuid)) { + length = -EINVAL; + goto out_free_page; + } } length = audit_set_loginuid(kloginuid); diff --git a/kernel/auditsc.c b/kernel/auditsc.c index b55788bf160..c75d7813aef 100644 --- a/kernel/auditsc.c +++ b/kernel/auditsc.c @@ -2019,7 +2019,9 @@ int audit_set_loginuid(kuid_t loginuid) if (rc) goto out; - sessionid = atomic_inc_return(&session_id); + /* are we setting or clearing? */ + if (uid_valid(loginuid)) + sessionid = atomic_inc_return(&session_id); task->sessionid = sessionid; task->loginuid = loginuid; -- cgit v1.2.3-70-g09d2 From d040e5af380554c23ffe0a034ae5f3e53da93a1d Mon Sep 17 00:00:00 2001 From: Eric Paris Date: Fri, 24 May 2013 09:18:04 -0400 Subject: audit: audit feature to only allow unsetting the loginuid This is a new audit feature which only grants processes with CAP_AUDIT_CONTROL the ability to unset their loginuid. They cannot directly set it from a valid uid to another valid uid. The ability to unset the loginuid is nice because a priviledged task, like that of container creation, can unset the loginuid and then priv is not needed inside the container when a login daemon needs to set the loginuid. Signed-off-by: Eric Paris Signed-off-by: Richard Guy Briggs Signed-off-by: Eric Paris --- include/uapi/linux/audit.h | 3 ++- kernel/audit.c | 3 ++- kernel/auditsc.c | 3 +++ 3 files changed, 7 insertions(+), 2 deletions(-) (limited to 'kernel') diff --git a/include/uapi/linux/audit.h b/include/uapi/linux/audit.h index 9eddf2ca614..05e5e8fc2ac 100644 --- a/include/uapi/linux/audit.h +++ b/include/uapi/linux/audit.h @@ -386,7 +386,8 @@ struct audit_features { __u32 lock; /* which features to lock */ }; -#define AUDIT_LAST_FEATURE -1 +#define AUDIT_FEATURE_ONLY_UNSET_LOGINUID 0 +#define AUDIT_LAST_FEATURE AUDIT_FEATURE_ONLY_UNSET_LOGINUID #define audit_feature_valid(x) ((x) >= 0 && (x) <= AUDIT_LAST_FEATURE) #define AUDIT_FEATURE_TO_MASK(x) (1 << ((x) & 31)) /* mask for __u32 */ diff --git a/kernel/audit.c b/kernel/audit.c index 29ee6a421c6..fbfa3a74dec 100644 --- a/kernel/audit.c +++ b/kernel/audit.c @@ -144,7 +144,8 @@ static struct audit_features af = {.vers = AUDIT_FEATURE_VERSION, .features = 0, .lock = 0,}; -static char *audit_feature_names[0] = { +static char *audit_feature_names[1] = { + "only_unset_loginuid", }; diff --git a/kernel/auditsc.c b/kernel/auditsc.c index c75d7813aef..924c0bf048d 100644 --- a/kernel/auditsc.c +++ b/kernel/auditsc.c @@ -1974,6 +1974,9 @@ static int audit_set_loginuid_perm(kuid_t loginuid) /* it is set, you need permission */ if (!capable(CAP_AUDIT_CONTROL)) return -EPERM; + /* reject if this is not an unset and we don't allow that */ + if (is_audit_feature_set(AUDIT_FEATURE_ONLY_UNSET_LOGINUID) && uid_valid(loginuid)) + return -EPERM; return 0; } -- cgit v1.2.3-70-g09d2 From 21b85c31d23f2047d47e1f74bfa5caa8b75c1c77 Mon Sep 17 00:00:00 2001 From: Eric Paris Date: Thu, 23 May 2013 14:26:00 -0400 Subject: audit: audit feature to set loginuid immutable This adds a new 'audit_feature' bit which allows userspace to set it such that the loginuid is absolutely immutable, even if you have CAP_AUDIT_CONTROL. Signed-off-by: Eric Paris Signed-off-by: Richard Guy Briggs Signed-off-by: Eric Paris --- include/uapi/linux/audit.h | 3 ++- kernel/audit.c | 3 ++- kernel/auditsc.c | 3 +++ 3 files changed, 7 insertions(+), 2 deletions(-) (limited to 'kernel') diff --git a/include/uapi/linux/audit.h b/include/uapi/linux/audit.h index 05e5e8fc2ac..e2f0d997713 100644 --- a/include/uapi/linux/audit.h +++ b/include/uapi/linux/audit.h @@ -387,7 +387,8 @@ struct audit_features { }; #define AUDIT_FEATURE_ONLY_UNSET_LOGINUID 0 -#define AUDIT_LAST_FEATURE AUDIT_FEATURE_ONLY_UNSET_LOGINUID +#define AUDIT_FEATURE_LOGINUID_IMMUTABLE 1 +#define AUDIT_LAST_FEATURE AUDIT_FEATURE_LOGINUID_IMMUTABLE #define audit_feature_valid(x) ((x) >= 0 && (x) <= AUDIT_LAST_FEATURE) #define AUDIT_FEATURE_TO_MASK(x) (1 << ((x) & 31)) /* mask for __u32 */ diff --git a/kernel/audit.c b/kernel/audit.c index fbfa3a74dec..f3f36f5eb4a 100644 --- a/kernel/audit.c +++ b/kernel/audit.c @@ -144,8 +144,9 @@ static struct audit_features af = {.vers = AUDIT_FEATURE_VERSION, .features = 0, .lock = 0,}; -static char *audit_feature_names[1] = { +static char *audit_feature_names[2] = { "only_unset_loginuid", + "loginuid_immutable", }; diff --git a/kernel/auditsc.c b/kernel/auditsc.c index 924c0bf048d..63223d671a6 100644 --- a/kernel/auditsc.c +++ b/kernel/auditsc.c @@ -1971,6 +1971,9 @@ static int audit_set_loginuid_perm(kuid_t loginuid) /* if we are unset, we don't need privs */ if (!audit_loginuid_set(current)) return 0; + /* if AUDIT_FEATURE_LOGINUID_IMMUTABLE means never ever allow a change*/ + if (is_audit_feature_set(AUDIT_FEATURE_LOGINUID_IMMUTABLE)) + return -EPERM; /* it is set, you need permission */ if (!capable(CAP_AUDIT_CONTROL)) return -EPERM; -- cgit v1.2.3-70-g09d2 From db510fc5cd9b9db214d7ec1828662942fac19c8c Mon Sep 17 00:00:00 2001 From: Richard Guy Briggs Date: Thu, 4 Jul 2013 12:56:11 -0400 Subject: audit: update AUDIT_INODE filter rule to comparator function It appears this one comparison function got missed in f368c07d (and 9c937dcc). Signed-off-by: Richard Guy Briggs Signed-off-by: Eric Paris --- kernel/auditsc.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'kernel') diff --git a/kernel/auditsc.c b/kernel/auditsc.c index 63223d671a6..065c7a14935 100644 --- a/kernel/auditsc.c +++ b/kernel/auditsc.c @@ -566,7 +566,7 @@ static int audit_filter_rules(struct task_struct *tsk, break; case AUDIT_INODE: if (name) - result = (name->ino == f->val); + result = audit_comparator(name->ino, f->op, f->val); else if (ctx) { list_for_each_entry(n, &ctx->names_list, list) { if (audit_comparator(n->ino, f->op, f->val)) { -- cgit v1.2.3-70-g09d2 From 64fbff9ae0a0a843365d922e0057fc785f23f0e3 Mon Sep 17 00:00:00 2001 From: Mathias Krause Date: Mon, 30 Sep 2013 22:04:24 +0200 Subject: audit: fix info leak in AUDIT_GET requests We leak 4 bytes of kernel stack in response to an AUDIT_GET request as we miss to initialize the mask member of status_set. Fix that. Cc: Al Viro Cc: Eric Paris Cc: stable@vger.kernel.org # v2.6.6+ Signed-off-by: Mathias Krause Signed-off-by: Richard Guy Briggs Signed-off-by: Eric Paris --- kernel/audit.c | 1 + 1 file changed, 1 insertion(+) (limited to 'kernel') diff --git a/kernel/audit.c b/kernel/audit.c index f3f36f5eb4a..e0f7767e4a3 100644 --- a/kernel/audit.c +++ b/kernel/audit.c @@ -759,6 +759,7 @@ static int audit_receive_msg(struct sk_buff *skb, struct nlmsghdr *nlh) switch (msg_type) { case AUDIT_GET: + status_set.mask = 0; status_set.enabled = audit_enabled; status_set.failure = audit_failure; status_set.pid = audit_pid; -- cgit v1.2.3-70-g09d2 From e13f91e3c57986a609c10ddf94af0546a2a97dce Mon Sep 17 00:00:00 2001 From: Eric Paris Date: Tue, 5 Nov 2013 10:48:02 -0500 Subject: audit: use memset instead of trying to initialize field by field We currently are setting fields to 0 to initialize the structure declared on the stack. This is a bad idea as if the structure has holes or unpacked space these will not be initialized. Just use memset. This is not a performance critical section of code. Signed-off-by: Eric Paris --- kernel/audit.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'kernel') diff --git a/kernel/audit.c b/kernel/audit.c index e0f7767e4a3..3ce60e063aa 100644 --- a/kernel/audit.c +++ b/kernel/audit.c @@ -759,7 +759,7 @@ static int audit_receive_msg(struct sk_buff *skb, struct nlmsghdr *nlh) switch (msg_type) { case AUDIT_GET: - status_set.mask = 0; + memset(&status_set, 0, sizeof(status_set)); status_set.enabled = audit_enabled; status_set.failure = audit_failure; status_set.pid = audit_pid; -- cgit v1.2.3-70-g09d2 From 4d8fe7376a12bf4524783dd95cbc00f1fece6232 Mon Sep 17 00:00:00 2001 From: Mathias Krause Date: Mon, 30 Sep 2013 22:04:25 +0200 Subject: audit: use nlmsg_len() to get message payload length Using the nlmsg_len member of the netlink header to test if the message is valid is wrong as it includes the size of the netlink header itself. Thereby allowing to send short netlink messages that pass those checks. Use nlmsg_len() instead to test for the right message length. The result of nlmsg_len() is guaranteed to be non-negative as the netlink message already passed the checks of nlmsg_ok(). Also switch to min_t() to please checkpatch.pl. Cc: Al Viro Cc: Eric Paris Cc: stable@vger.kernel.org # v2.6.6+ for the 1st hunk, v2.6.23+ for the 2nd Signed-off-by: Mathias Krause Signed-off-by: Richard Guy Briggs Signed-off-by: Eric Paris --- kernel/audit.c | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) (limited to 'kernel') diff --git a/kernel/audit.c b/kernel/audit.c index 3ce60e063aa..3ad14e6e00b 100644 --- a/kernel/audit.c +++ b/kernel/audit.c @@ -771,7 +771,7 @@ static int audit_receive_msg(struct sk_buff *skb, struct nlmsghdr *nlh) &status_set, sizeof(status_set)); break; case AUDIT_SET: - if (nlh->nlmsg_len < sizeof(struct audit_status)) + if (nlmsg_len(nlh) < sizeof(struct audit_status)) return -EINVAL; status_get = (struct audit_status *)data; if (status_get->mask & AUDIT_STATUS_ENABLED) { @@ -944,7 +944,7 @@ static int audit_receive_msg(struct sk_buff *skb, struct nlmsghdr *nlh) memset(&s, 0, sizeof(s)); /* guard against past and future API changes */ - memcpy(&s, data, min(sizeof(s), (size_t)nlh->nlmsg_len)); + memcpy(&s, data, min_t(size_t, sizeof(s), nlmsg_len(nlh))); if ((s.enabled != 0 && s.enabled != 1) || (s.log_passwd != 0 && s.log_passwd != 1)) return -EINVAL; -- cgit v1.2.3-70-g09d2 From b95d77fe341b9c9641addb326cf43c30d1ba23b8 Mon Sep 17 00:00:00 2001 From: Richard Guy Briggs Date: Fri, 3 May 2013 14:03:49 -0400 Subject: audit: use given values in tty_audit enable api In send/GET, we don't want the kernel to lie about what value is set. Signed-off-by: Richard Guy Briggs Signed-off-by: Eric Paris --- kernel/audit.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'kernel') diff --git a/kernel/audit.c b/kernel/audit.c index 3ad14e6e00b..616164038ad 100644 --- a/kernel/audit.c +++ b/kernel/audit.c @@ -930,7 +930,7 @@ static int audit_receive_msg(struct sk_buff *skb, struct nlmsghdr *nlh) struct task_struct *tsk = current; spin_lock(&tsk->sighand->siglock); - s.enabled = tsk->signal->audit_tty != 0; + s.enabled = tsk->signal->audit_tty; s.log_passwd = tsk->signal->audit_tty_log_passwd; spin_unlock(&tsk->sighand->siglock); -- cgit v1.2.3-70-g09d2 From d3aea84a4ace5ff9ce7fb7714cee07bebef681c2 Mon Sep 17 00:00:00 2001 From: Jeff Layton Date: Wed, 8 May 2013 10:32:23 -0400 Subject: audit: log the audit_names record type ...to make it clear what the intent behind each record's operation was. In many cases you can infer this, based on the context of the syscall and the result. In other cases it's not so obvious. For instance, in the case where you have a file being renamed over another, you'll have two different records with the same filename but different inode info. By logging this information we can clearly tell which one was created and which was deleted. This fixes what was broken in commit bfcec708. Commit 79f6530c should also be backported to stable v3.7+. Signed-off-by: Jeff Layton Signed-off-by: Eric Paris Signed-off-by: Richard Guy Briggs Signed-off-by: Eric Paris --- kernel/audit.c | 20 ++++++++++++++++++++ 1 file changed, 20 insertions(+) (limited to 'kernel') diff --git a/kernel/audit.c b/kernel/audit.c index 616164038ad..b8831ac25b7 100644 --- a/kernel/audit.c +++ b/kernel/audit.c @@ -1640,6 +1640,26 @@ void audit_log_name(struct audit_context *context, struct audit_names *n, } } + /* log the audit_names record type */ + audit_log_format(ab, " nametype="); + switch(n->type) { + case AUDIT_TYPE_NORMAL: + audit_log_format(ab, "NORMAL"); + break; + case AUDIT_TYPE_PARENT: + audit_log_format(ab, "PARENT"); + break; + case AUDIT_TYPE_CHILD_DELETE: + audit_log_format(ab, "DELETE"); + break; + case AUDIT_TYPE_CHILD_CREATE: + audit_log_format(ab, "CREATE"); + break; + default: + audit_log_format(ab, "UNKNOWN"); + break; + } + audit_log_fcaps(ab, n); audit_log_end(ab); } -- cgit v1.2.3-70-g09d2 From 78122037b7e8febbd3116ab0da3ee6c34756bde9 Mon Sep 17 00:00:00 2001 From: Eric Paris Date: Wed, 4 Sep 2013 15:01:43 -0400 Subject: audit: do not reject all AUDIT_INODE filter types commit ab61d38ed8cf670946d12dc46b9198b521c790ea tried to merge the invalid filter checking into a single function. However AUDIT_INODE filters were not verified in the new generic checker. Thus such rules were being denied even though they were perfectly valid. Ex: $ auditctl -a exit,always -F arch=b64 -S open -F key=/foo -F inode=6955 -F devmajor=9 -F devminor=1 Error sending add rule data request (Invalid argument) Signed-off-by: Eric Paris Signed-off-by: Richard Guy Briggs Signed-off-by: Eric Paris --- kernel/auditfilter.c | 1 + 1 file changed, 1 insertion(+) (limited to 'kernel') diff --git a/kernel/auditfilter.c b/kernel/auditfilter.c index 8a344cebd8b..51f3fd4c1ed 100644 --- a/kernel/auditfilter.c +++ b/kernel/auditfilter.c @@ -343,6 +343,7 @@ static int audit_field_valid(struct audit_entry *entry, struct audit_field *f) case AUDIT_DEVMINOR: case AUDIT_EXIT: case AUDIT_SUCCESS: + case AUDIT_INODE: /* bit ops are only useful on syscall args */ if (f->op == Audit_bitmask || f->op == Audit_bittest) return -EINVAL; -- cgit v1.2.3-70-g09d2 From bd131fb1aa5e4cd879f89aef30f4f7cde6d4b409 Mon Sep 17 00:00:00 2001 From: "Eric W. Biederman" Date: Tue, 19 Mar 2013 00:09:40 -0700 Subject: audit: Kill the unused struct audit_aux_data_capset Signed-off-by: "Eric W. Biederman" (cherry picked from ebiederman commit 6904431d6b41190e42d6b94430b67cb4e7e6a4b7) Signed-off-by: Eric Paris --- kernel/auditsc.c | 6 ------ 1 file changed, 6 deletions(-) (limited to 'kernel') diff --git a/kernel/auditsc.c b/kernel/auditsc.c index 065c7a14935..c7b97aa70c6 100644 --- a/kernel/auditsc.c +++ b/kernel/auditsc.c @@ -121,12 +121,6 @@ struct audit_aux_data_bprm_fcaps { struct audit_cap_data new_pcap; }; -struct audit_aux_data_capset { - struct audit_aux_data d; - pid_t pid; - struct audit_cap_data cap; -}; - struct audit_tree_refs { struct audit_tree_refs *next; struct audit_chunk *c[31]; -- cgit v1.2.3-70-g09d2 From 9462dc59817580419ef1f2504e32f861c290f251 Mon Sep 17 00:00:00 2001 From: Richard Guy Briggs Date: Wed, 23 Oct 2013 16:55:38 -0400 Subject: audit: remove unused envc member of audit_aux_data_execve Get rid of write-only audit_aux_data_exeve structure member envc. Signed-off-by: Richard Guy Briggs Signed-off-by: Eric Paris --- kernel/auditsc.c | 2 -- 1 file changed, 2 deletions(-) (limited to 'kernel') diff --git a/kernel/auditsc.c b/kernel/auditsc.c index c7b97aa70c6..11078f32d13 100644 --- a/kernel/auditsc.c +++ b/kernel/auditsc.c @@ -98,7 +98,6 @@ struct audit_aux_data { struct audit_aux_data_execve { struct audit_aux_data d; int argc; - int envc; struct mm_struct *mm; }; @@ -2158,7 +2157,6 @@ int __audit_bprm(struct linux_binprm *bprm) return -ENOMEM; ax->argc = bprm->argc; - ax->envc = bprm->envc; ax->mm = bprm->mm; ax->d.type = AUDIT_EXECVE; ax->d.next = context->aux; -- cgit v1.2.3-70-g09d2 From d9cfea91e97d5d19f9d69beaa844f5fe56a6adc6 Mon Sep 17 00:00:00 2001 From: Richard Guy Briggs Date: Wed, 30 Oct 2013 17:56:13 -0400 Subject: audit: move audit_aux_data_execve contents into audit_context union audit_bprm() was being called to add an AUDIT_EXECVE record to the audit context every time search_binary_handler() was recursively called. Only one reference is necessary, so just update it. Move the the contents of audit_aux_data_execve into the union in audit_context, removing dependence on a kmalloc along the way. Reported-by: Oleg Nesterov Cc: Eric Paris Signed-off-by: Richard Guy Briggs Signed-off-by: Eric Paris --- include/linux/audit.h | 4 ++-- kernel/audit.h | 4 ++++ kernel/auditsc.c | 41 ++++++++++++----------------------------- 3 files changed, 18 insertions(+), 31 deletions(-) (limited to 'kernel') diff --git a/include/linux/audit.h b/include/linux/audit.h index 7b31bec9bcc..08b38bf13eb 100644 --- a/include/linux/audit.h +++ b/include/linux/audit.h @@ -209,7 +209,7 @@ static inline int audit_get_sessionid(struct task_struct *tsk) extern void __audit_ipc_obj(struct kern_ipc_perm *ipcp); extern void __audit_ipc_set_perm(unsigned long qbytes, uid_t uid, gid_t gid, umode_t mode); -extern int __audit_bprm(struct linux_binprm *bprm); +extern void __audit_bprm(struct linux_binprm *bprm); extern int __audit_socketcall(int nargs, unsigned long *args); extern int __audit_sockaddr(int len, void *addr); extern void __audit_fd_pair(int fd1, int fd2); @@ -241,7 +241,7 @@ static inline void audit_ipc_set_perm(unsigned long qbytes, uid_t uid, gid_t gid static inline int audit_bprm(struct linux_binprm *bprm) { if (unlikely(!audit_dummy_context())) - return __audit_bprm(bprm); + __audit_bprm(bprm); return 0; } static inline int audit_socketcall(int nargs, unsigned long *args) diff --git a/kernel/audit.h b/kernel/audit.h index 123c9b7c397..e7b94ab66c4 100644 --- a/kernel/audit.h +++ b/kernel/audit.h @@ -197,6 +197,10 @@ struct audit_context { int fd; int flags; } mmap; + struct { + int argc; + struct mm_struct *mm; + } execve; }; int fds[2]; diff --git a/kernel/auditsc.c b/kernel/auditsc.c index 11078f32d13..425a8939be1 100644 --- a/kernel/auditsc.c +++ b/kernel/auditsc.c @@ -95,12 +95,6 @@ struct audit_aux_data { /* Number of target pids per aux struct. */ #define AUDIT_AUX_PIDS 16 -struct audit_aux_data_execve { - struct audit_aux_data d; - int argc; - struct mm_struct *mm; -}; - struct audit_aux_data_pids { struct audit_aux_data d; pid_t target_pid[AUDIT_AUX_PIDS]; @@ -1144,20 +1138,19 @@ static int audit_log_single_execve_arg(struct audit_context *context, } static void audit_log_execve_info(struct audit_context *context, - struct audit_buffer **ab, - struct audit_aux_data_execve *axi) + struct audit_buffer **ab) { int i, len; size_t len_sent = 0; const char __user *p; char *buf; - if (axi->mm != current->mm) + if (context->execve.mm != current->mm) return; /* execve failed, no additional info */ - p = (const char __user *)axi->mm->arg_start; + p = (const char __user *)current->mm->arg_start; - audit_log_format(*ab, "argc=%d", axi->argc); + audit_log_format(*ab, "argc=%d", context->execve.argc); /* * we need some kernel buffer to hold the userspace args. Just @@ -1171,7 +1164,7 @@ static void audit_log_execve_info(struct audit_context *context, return; } - for (i = 0; i < axi->argc; i++) { + for (i = 0; i < context->execve.argc; i++) { len = audit_log_single_execve_arg(context, ab, i, &len_sent, p, buf); if (len <= 0) @@ -1274,6 +1267,9 @@ static void show_special(struct audit_context *context, int *call_panic) audit_log_format(ab, "fd=%d flags=0x%x", context->mmap.fd, context->mmap.flags); break; } + case AUDIT_EXECVE: { + audit_log_execve_info(context, &ab); + break; } } audit_log_end(ab); } @@ -1320,11 +1316,6 @@ static void audit_log_exit(struct audit_context *context, struct task_struct *ts switch (aux->type) { - case AUDIT_EXECVE: { - struct audit_aux_data_execve *axi = (void *)aux; - audit_log_execve_info(context, &ab, axi); - break; } - case AUDIT_BPRM_FCAPS: { struct audit_aux_data_bprm_fcaps *axs = (void *)aux; audit_log_format(ab, "fver=%x", axs->fcap_ver); @@ -2147,21 +2138,13 @@ void __audit_ipc_set_perm(unsigned long qbytes, uid_t uid, gid_t gid, umode_t mo context->ipc.has_perm = 1; } -int __audit_bprm(struct linux_binprm *bprm) +void __audit_bprm(struct linux_binprm *bprm) { - struct audit_aux_data_execve *ax; struct audit_context *context = current->audit_context; - ax = kmalloc(sizeof(*ax), GFP_KERNEL); - if (!ax) - return -ENOMEM; - - ax->argc = bprm->argc; - ax->mm = bprm->mm; - ax->d.type = AUDIT_EXECVE; - ax->d.next = context->aux; - context->aux = (void *)ax; - return 0; + context->type = AUDIT_EXECVE; + context->execve.argc = bprm->argc; + context->execve.mm = bprm->mm; } -- cgit v1.2.3-70-g09d2 From 9410d228a4cf434305306746bb799fb7acdd8648 Mon Sep 17 00:00:00 2001 From: Richard Guy Briggs Date: Wed, 30 Oct 2013 18:05:24 -0400 Subject: audit: call audit_bprm() only once to add AUDIT_EXECVE information Move the audit_bprm() call from search_binary_handler() to exec_binprm(). This allows us to get rid of the mm member of struct audit_aux_data_execve since bprm->mm will equal current->mm. This also mitigates the issue that ->argc could be modified by the load_binary() call in search_binary_handler(). audit_bprm() was being called to add an AUDIT_EXECVE record to the audit context every time search_binary_handler() was recursively called. Only one reference is necessary. Reported-by: Oleg Nesterov Cc: Eric Paris Signed-off-by: Richard Guy Briggs Signed-off-by: Eric Paris --- This patch is against 3.11, but was developed on Oleg's post-3.11 patches that introduce exec_binprm(). --- fs/exec.c | 5 +---- include/linux/audit.h | 9 +++------ kernel/audit.h | 1 - kernel/auditsc.c | 4 ---- 4 files changed, 4 insertions(+), 15 deletions(-) (limited to 'kernel') diff --git a/fs/exec.c b/fs/exec.c index fd774c7cb48..c5c24f2fc44 100644 --- a/fs/exec.c +++ b/fs/exec.c @@ -1383,10 +1383,6 @@ int search_binary_handler(struct linux_binprm *bprm) if (retval) return retval; - retval = audit_bprm(bprm); - if (retval) - return retval; - /* Need to fetch pid before load_binary changes it */ old_pid = current->pid; rcu_read_lock(); @@ -1408,6 +1404,7 @@ int search_binary_handler(struct linux_binprm *bprm) bprm->recursion_depth = depth; if (retval >= 0) { if (depth == 0) { + audit_bprm(bprm); trace_sched_process_exec(current, old_pid, bprm); ptrace_event(PTRACE_EVENT_EXEC, old_vpid); } diff --git a/include/linux/audit.h b/include/linux/audit.h index 08b38bf13eb..a40641954c2 100644 --- a/include/linux/audit.h +++ b/include/linux/audit.h @@ -238,11 +238,10 @@ static inline void audit_ipc_set_perm(unsigned long qbytes, uid_t uid, gid_t gid if (unlikely(!audit_dummy_context())) __audit_ipc_set_perm(qbytes, uid, gid, mode); } -static inline int audit_bprm(struct linux_binprm *bprm) +static inline void audit_bprm(struct linux_binprm *bprm) { if (unlikely(!audit_dummy_context())) __audit_bprm(bprm); - return 0; } static inline int audit_socketcall(int nargs, unsigned long *args) { @@ -369,10 +368,8 @@ static inline void audit_ipc_obj(struct kern_ipc_perm *ipcp) static inline void audit_ipc_set_perm(unsigned long qbytes, uid_t uid, gid_t gid, umode_t mode) { } -static inline int audit_bprm(struct linux_binprm *bprm) -{ - return 0; -} +static inline void audit_bprm(struct linux_binprm *bprm) +{ } static inline int audit_socketcall(int nargs, unsigned long *args) { return 0; diff --git a/kernel/audit.h b/kernel/audit.h index e7b94ab66c4..b779642b29a 100644 --- a/kernel/audit.h +++ b/kernel/audit.h @@ -199,7 +199,6 @@ struct audit_context { } mmap; struct { int argc; - struct mm_struct *mm; } execve; }; int fds[2]; diff --git a/kernel/auditsc.c b/kernel/auditsc.c index 425a8939be1..dfc5d6745ee 100644 --- a/kernel/auditsc.c +++ b/kernel/auditsc.c @@ -1145,9 +1145,6 @@ static void audit_log_execve_info(struct audit_context *context, const char __user *p; char *buf; - if (context->execve.mm != current->mm) - return; /* execve failed, no additional info */ - p = (const char __user *)current->mm->arg_start; audit_log_format(*ab, "argc=%d", context->execve.argc); @@ -2144,7 +2141,6 @@ void __audit_bprm(struct linux_binprm *bprm) context->type = AUDIT_EXECVE; context->execve.argc = bprm->argc; - context->execve.mm = bprm->mm; } -- cgit v1.2.3-70-g09d2 From 9175c9d2aed528800175ef81c90569d00d23f9be Mon Sep 17 00:00:00 2001 From: Eric Paris Date: Wed, 6 Nov 2013 10:47:17 -0500 Subject: audit: fix type of sessionid in audit_set_loginuid() sfr pointed out that with CONFIG_UIDGID_STRICT_TYPE_CHECKS set the audit tree would not build. This is because the oldsessionid in audit_set_loginuid() was accidentally being declared as a kuid_t. This patch fixes that declaration mistake. Example of problem: kernel/auditsc.c: In function 'audit_set_loginuid': kernel/auditsc.c:2003:15: error: incompatible types when assigning to type 'kuid_t' from type 'int' oldsessionid = audit_get_sessionid(current); Reported-by: Stephen Rothwell Signed-off-by: Eric Paris --- kernel/auditsc.c | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) (limited to 'kernel') diff --git a/kernel/auditsc.c b/kernel/auditsc.c index dfc5d6745ee..90594c9f755 100644 --- a/kernel/auditsc.c +++ b/kernel/auditsc.c @@ -1995,8 +1995,8 @@ static void audit_log_set_loginuid(kuid_t koldloginuid, kuid_t kloginuid, int audit_set_loginuid(kuid_t loginuid) { struct task_struct *task = current; - unsigned int sessionid = -1; - kuid_t oldloginuid, oldsessionid; + unsigned int oldsessionid, sessionid = (unsigned int)-1; + kuid_t oldloginuid; int rc; oldloginuid = audit_get_loginuid(current); -- cgit v1.2.3-70-g09d2 From 6a0c7cd33075f6b7f1d80145bb19812beb3fc5c9 Mon Sep 17 00:00:00 2001 From: "Rafael J. Wysocki" Date: Thu, 14 Nov 2013 23:26:58 +0100 Subject: PM / Hibernate: Do not crash kernel in free_basic_memory_bitmaps() I have received a report about the BUG_ON() in free_basic_memory_bitmaps() triggering mysteriously during an aborted s2disk hibernation attempt. The only way I can explain that is that /dev/snapshot was first opened for writing (resume mode), then closed and then opened again for reading and closed again without freezing tasks. In that case the first invocation of snapshot_open() would set the free_bitmaps flag in snapshot_state, which is a static variable. That flag wouldn't be cleared later and the second invocation of snapshot_open() would just leave it like that, so the subsequent snapshot_release() would see data->frozen set and free_basic_memory_bitmaps() would be called unnecessarily. To prevent that from happening clear data->free_bitmaps in snapshot_open() when the file is being opened for reading (hibernate mode). In addition to that, replace the BUG_ON() in free_basic_memory_bitmaps() with a WARN_ON() as the kernel can continue just fine if the condition checked by that macro occurs. Fixes: aab172891542 (PM / hibernate: Fix user space driven resume regression) Reported-by: Oliver Lorenz Signed-off-by: Rafael J. Wysocki Cc: 3.12+ # 3.12+ --- kernel/power/snapshot.c | 3 ++- kernel/power/user.c | 1 + 2 files changed, 3 insertions(+), 1 deletion(-) (limited to 'kernel') diff --git a/kernel/power/snapshot.c b/kernel/power/snapshot.c index 98c3b34a4cf..ac2d1f69490 100644 --- a/kernel/power/snapshot.c +++ b/kernel/power/snapshot.c @@ -792,7 +792,8 @@ void free_basic_memory_bitmaps(void) { struct memory_bitmap *bm1, *bm2; - BUG_ON(!(forbidden_pages_map && free_pages_map)); + if (WARN_ON(!(forbidden_pages_map && free_pages_map))) + return; bm1 = forbidden_pages_map; bm2 = free_pages_map; diff --git a/kernel/power/user.c b/kernel/power/user.c index 957f06164ad..ffc931c384a 100644 --- a/kernel/power/user.c +++ b/kernel/power/user.c @@ -70,6 +70,7 @@ static int snapshot_open(struct inode *inode, struct file *filp) data->swap = swsusp_resume_device ? swap_type_of(swsusp_resume_device, 0, NULL) : -1; data->mode = O_RDONLY; + data->free_bitmaps = false; error = pm_notifier_call_chain(PM_HIBERNATION_PREPARE); if (error) pm_notifier_call_chain(PM_POST_HIBERNATION); -- cgit v1.2.3-70-g09d2 From b26d4cd385fc51e8844e2cdf9ba2051f5bba11a5 Mon Sep 17 00:00:00 2001 From: Al Viro Date: Fri, 25 Oct 2013 18:47:37 -0400 Subject: consolidate simple ->d_delete() instances Rename simple_delete_dentry() to always_delete_dentry() and export it. Export simple_dentry_operations, while we are at it, and get rid of their duplicates Signed-off-by: Al Viro --- arch/ia64/kernel/perfmon.c | 8 +------- fs/9p/vfs_dentry.c | 19 +------------------ fs/configfs/dir.c | 12 +----------- fs/efivarfs/super.c | 11 +---------- fs/hostfs/hostfs_kern.c | 11 +---------- fs/libfs.c | 12 +++++++----- fs/proc/generic.c | 18 +----------------- fs/proc/namespaces.c | 8 +------- include/linux/fs.h | 2 ++ kernel/cgroup.c | 7 +------ net/sunrpc/rpc_pipe.c | 11 +---------- 11 files changed, 18 insertions(+), 101 deletions(-) (limited to 'kernel') diff --git a/arch/ia64/kernel/perfmon.c b/arch/ia64/kernel/perfmon.c index 5a9ff1c3c3e..cb592773c78 100644 --- a/arch/ia64/kernel/perfmon.c +++ b/arch/ia64/kernel/perfmon.c @@ -2166,12 +2166,6 @@ static const struct file_operations pfm_file_ops = { .flush = pfm_flush }; -static int -pfmfs_delete_dentry(const struct dentry *dentry) -{ - return 1; -} - static char *pfmfs_dname(struct dentry *dentry, char *buffer, int buflen) { return dynamic_dname(dentry, buffer, buflen, "pfm:[%lu]", @@ -2179,7 +2173,7 @@ static char *pfmfs_dname(struct dentry *dentry, char *buffer, int buflen) } static const struct dentry_operations pfmfs_dentry_operations = { - .d_delete = pfmfs_delete_dentry, + .d_delete = always_delete_dentry, .d_dname = pfmfs_dname, }; diff --git a/fs/9p/vfs_dentry.c b/fs/9p/vfs_dentry.c index f039b104a98..b03dd23feda 100644 --- a/fs/9p/vfs_dentry.c +++ b/fs/9p/vfs_dentry.c @@ -42,23 +42,6 @@ #include "v9fs_vfs.h" #include "fid.h" -/** - * v9fs_dentry_delete - called when dentry refcount equals 0 - * @dentry: dentry in question - * - * By returning 1 here we should remove cacheing of unused - * dentry components. - * - */ - -static int v9fs_dentry_delete(const struct dentry *dentry) -{ - p9_debug(P9_DEBUG_VFS, " dentry: %s (%p)\n", - dentry->d_name.name, dentry); - - return 1; -} - /** * v9fs_cached_dentry_delete - called when dentry refcount equals 0 * @dentry: dentry in question @@ -134,6 +117,6 @@ const struct dentry_operations v9fs_cached_dentry_operations = { }; const struct dentry_operations v9fs_dentry_operations = { - .d_delete = v9fs_dentry_delete, + .d_delete = always_delete_dentry, .d_release = v9fs_dentry_release, }; diff --git a/fs/configfs/dir.c b/fs/configfs/dir.c index 277bd1be21f..4522e075577 100644 --- a/fs/configfs/dir.c +++ b/fs/configfs/dir.c @@ -66,19 +66,9 @@ static void configfs_d_iput(struct dentry * dentry, iput(inode); } -/* - * We _must_ delete our dentries on last dput, as the chain-to-parent - * behavior is required to clear the parents of default_groups. - */ -static int configfs_d_delete(const struct dentry *dentry) -{ - return 1; -} - const struct dentry_operations configfs_dentry_ops = { .d_iput = configfs_d_iput, - /* simple_delete_dentry() isn't exported */ - .d_delete = configfs_d_delete, + .d_delete = always_delete_dentry, }; #ifdef CONFIG_LOCKDEP diff --git a/fs/efivarfs/super.c b/fs/efivarfs/super.c index a8766b880c0..becc725a195 100644 --- a/fs/efivarfs/super.c +++ b/fs/efivarfs/super.c @@ -83,19 +83,10 @@ static int efivarfs_d_hash(const struct dentry *dentry, struct qstr *qstr) return 0; } -/* - * Retaining negative dentries for an in-memory filesystem just wastes - * memory and lookup time: arrange for them to be deleted immediately. - */ -static int efivarfs_delete_dentry(const struct dentry *dentry) -{ - return 1; -} - static struct dentry_operations efivarfs_d_ops = { .d_compare = efivarfs_d_compare, .d_hash = efivarfs_d_hash, - .d_delete = efivarfs_delete_dentry, + .d_delete = always_delete_dentry, }; static struct dentry *efivarfs_alloc_dentry(struct dentry *parent, char *name) diff --git a/fs/hostfs/hostfs_kern.c b/fs/hostfs/hostfs_kern.c index 25437280a20..db23ce1bd90 100644 --- a/fs/hostfs/hostfs_kern.c +++ b/fs/hostfs/hostfs_kern.c @@ -33,15 +33,6 @@ static inline struct hostfs_inode_info *HOSTFS_I(struct inode *inode) #define FILE_HOSTFS_I(file) HOSTFS_I(file_inode(file)) -static int hostfs_d_delete(const struct dentry *dentry) -{ - return 1; -} - -static const struct dentry_operations hostfs_dentry_ops = { - .d_delete = hostfs_d_delete, -}; - /* Changed in hostfs_args before the kernel starts running */ static char *root_ino = ""; static int append = 0; @@ -925,7 +916,7 @@ static int hostfs_fill_sb_common(struct super_block *sb, void *d, int silent) sb->s_blocksize_bits = 10; sb->s_magic = HOSTFS_SUPER_MAGIC; sb->s_op = &hostfs_sbops; - sb->s_d_op = &hostfs_dentry_ops; + sb->s_d_op = &simple_dentry_operations; sb->s_maxbytes = MAX_LFS_FILESIZE; /* NULL is printed as by sprintf: avoid that. */ diff --git a/fs/libfs.c b/fs/libfs.c index 5de06947ba5..a1844244246 100644 --- a/fs/libfs.c +++ b/fs/libfs.c @@ -47,10 +47,16 @@ EXPORT_SYMBOL(simple_statfs); * Retaining negative dentries for an in-memory filesystem just wastes * memory and lookup time: arrange for them to be deleted immediately. */ -static int simple_delete_dentry(const struct dentry *dentry) +int always_delete_dentry(const struct dentry *dentry) { return 1; } +EXPORT_SYMBOL(always_delete_dentry); + +const struct dentry_operations simple_dentry_operations = { + .d_delete = always_delete_dentry, +}; +EXPORT_SYMBOL(simple_dentry_operations); /* * Lookup the data. This is trivial - if the dentry didn't already @@ -58,10 +64,6 @@ static int simple_delete_dentry(const struct dentry *dentry) */ struct dentry *simple_lookup(struct inode *dir, struct dentry *dentry, unsigned int flags) { - static const struct dentry_operations simple_dentry_operations = { - .d_delete = simple_delete_dentry, - }; - if (dentry->d_name.len > NAME_MAX) return ERR_PTR(-ENAMETOOLONG); if (!dentry->d_sb->s_d_op) diff --git a/fs/proc/generic.c b/fs/proc/generic.c index 737e15615b0..cca93b6fb9a 100644 --- a/fs/proc/generic.c +++ b/fs/proc/generic.c @@ -174,22 +174,6 @@ static const struct inode_operations proc_link_inode_operations = { .follow_link = proc_follow_link, }; -/* - * As some entries in /proc are volatile, we want to - * get rid of unused dentries. This could be made - * smarter: we could keep a "volatile" flag in the - * inode to indicate which ones to keep. - */ -static int proc_delete_dentry(const struct dentry * dentry) -{ - return 1; -} - -static const struct dentry_operations proc_dentry_operations = -{ - .d_delete = proc_delete_dentry, -}; - /* * Don't create negative dentries here, return -ENOENT by hand * instead. @@ -209,7 +193,7 @@ struct dentry *proc_lookup_de(struct proc_dir_entry *de, struct inode *dir, inode = proc_get_inode(dir->i_sb, de); if (!inode) return ERR_PTR(-ENOMEM); - d_set_d_op(dentry, &proc_dentry_operations); + d_set_d_op(dentry, &simple_dentry_operations); d_add(dentry, inode); return NULL; } diff --git a/fs/proc/namespaces.c b/fs/proc/namespaces.c index 49a7fff2e83..9ae46b87470 100644 --- a/fs/proc/namespaces.c +++ b/fs/proc/namespaces.c @@ -42,12 +42,6 @@ static const struct inode_operations ns_inode_operations = { .setattr = proc_setattr, }; -static int ns_delete_dentry(const struct dentry *dentry) -{ - /* Don't cache namespace inodes when not in use */ - return 1; -} - static char *ns_dname(struct dentry *dentry, char *buffer, int buflen) { struct inode *inode = dentry->d_inode; @@ -59,7 +53,7 @@ static char *ns_dname(struct dentry *dentry, char *buffer, int buflen) const struct dentry_operations ns_dentry_operations = { - .d_delete = ns_delete_dentry, + .d_delete = always_delete_dentry, .d_dname = ns_dname, }; diff --git a/include/linux/fs.h b/include/linux/fs.h index bf5d574ebdf..121f11f001c 100644 --- a/include/linux/fs.h +++ b/include/linux/fs.h @@ -2622,7 +2622,9 @@ extern int simple_write_begin(struct file *file, struct address_space *mapping, extern int simple_write_end(struct file *file, struct address_space *mapping, loff_t pos, unsigned len, unsigned copied, struct page *page, void *fsdata); +extern int always_delete_dentry(const struct dentry *); extern struct inode *alloc_anon_inode(struct super_block *); +extern const struct dentry_operations simple_dentry_operations; extern struct dentry *simple_lookup(struct inode *, struct dentry *, unsigned int flags); extern ssize_t generic_read_dir(struct file *, char __user *, size_t, loff_t *); diff --git a/kernel/cgroup.c b/kernel/cgroup.c index e0839bcd48c..4c62513fe19 100644 --- a/kernel/cgroup.c +++ b/kernel/cgroup.c @@ -895,11 +895,6 @@ static void cgroup_diput(struct dentry *dentry, struct inode *inode) iput(inode); } -static int cgroup_delete(const struct dentry *d) -{ - return 1; -} - static void remove_dir(struct dentry *d) { struct dentry *parent = dget(d->d_parent); @@ -1486,7 +1481,7 @@ static int cgroup_get_rootdir(struct super_block *sb) { static const struct dentry_operations cgroup_dops = { .d_iput = cgroup_diput, - .d_delete = cgroup_delete, + .d_delete = always_delete_dentry, }; struct inode *inode = diff --git a/net/sunrpc/rpc_pipe.c b/net/sunrpc/rpc_pipe.c index d0d14a04dce..bf04b30a788 100644 --- a/net/sunrpc/rpc_pipe.c +++ b/net/sunrpc/rpc_pipe.c @@ -471,15 +471,6 @@ struct rpc_filelist { umode_t mode; }; -static int rpc_delete_dentry(const struct dentry *dentry) -{ - return 1; -} - -static const struct dentry_operations rpc_dentry_operations = { - .d_delete = rpc_delete_dentry, -}; - static struct inode * rpc_get_inode(struct super_block *sb, umode_t mode) { @@ -1266,7 +1257,7 @@ rpc_fill_super(struct super_block *sb, void *data, int silent) sb->s_blocksize_bits = PAGE_CACHE_SHIFT; sb->s_magic = RPCAUTH_GSSMAGIC; sb->s_op = &s_ops; - sb->s_d_op = &rpc_dentry_operations; + sb->s_d_op = &simple_dentry_operations; sb->s_time_gran = 1; inode = rpc_get_inode(sb, S_IFDIR | S_IRUGO | S_IXUGO); -- cgit v1.2.3-70-g09d2 From d689fe222a858c767cb8594faf280048e532b53f Mon Sep 17 00:00:00 2001 From: Thomas Gleixner Date: Wed, 13 Nov 2013 21:01:57 +0100 Subject: NOHZ: Check for nohz active instead of nohz enabled RCU and the fine grained idle time accounting functions check tick_nohz_enabled. But that variable is merily telling that NOHZ has been enabled in the config and not been disabled on the command line. But it does not tell anything about nohz being active. That's what all this should check for. Matthew reported, that the idle accounting on his old P1 machine showed bogus values, when he enabled NOHZ in the config and did not disable it on the kernel command line. The reason is that his machine uses (refined) jiffies as a clocksource which explains why the "fine" grained accounting went into lala land, because it depends on when the system goes and leaves idle relative to the jiffies increment. Provide a tick_nohz_active indicator and let RCU and the accounting code use this instead of tick_nohz_enable. Reported-and-tested-by: Matthew Whitehead Signed-off-by: Thomas Gleixner Reviewed-by: Steven Rostedt Reviewed-by: Paul E. McKenney Cc: john.stultz@linaro.org Cc: mwhitehe@redhat.com Link: http://lkml.kernel.org/r/alpine.DEB.2.02.1311132052240.30673@ionos.tec.linutronix.de --- kernel/rcu/tree_plugin.h | 4 ++-- kernel/time/tick-sched.c | 21 +++++++++------------ 2 files changed, 11 insertions(+), 14 deletions(-) (limited to 'kernel') diff --git a/kernel/rcu/tree_plugin.h b/kernel/rcu/tree_plugin.h index 6abb03dff5c..08a76523243 100644 --- a/kernel/rcu/tree_plugin.h +++ b/kernel/rcu/tree_plugin.h @@ -1632,7 +1632,7 @@ module_param(rcu_idle_gp_delay, int, 0644); static int rcu_idle_lazy_gp_delay = RCU_IDLE_LAZY_GP_DELAY; module_param(rcu_idle_lazy_gp_delay, int, 0644); -extern int tick_nohz_enabled; +extern int tick_nohz_active; /* * Try to advance callbacks for all flavors of RCU on the current CPU, but @@ -1729,7 +1729,7 @@ static void rcu_prepare_for_idle(int cpu) int tne; /* Handle nohz enablement switches conservatively. */ - tne = ACCESS_ONCE(tick_nohz_enabled); + tne = ACCESS_ONCE(tick_nohz_active); if (tne != rdtp->tick_nohz_enabled_snap) { if (rcu_cpu_has_callbacks(cpu, NULL)) invoke_rcu_core(); /* force nohz to see update. */ diff --git a/kernel/time/tick-sched.c b/kernel/time/tick-sched.c index 3612fc77f83..a12df5abde0 100644 --- a/kernel/time/tick-sched.c +++ b/kernel/time/tick-sched.c @@ -361,8 +361,8 @@ void __init tick_nohz_init(void) /* * NO HZ enabled ? */ -int tick_nohz_enabled __read_mostly = 1; - +static int tick_nohz_enabled __read_mostly = 1; +int tick_nohz_active __read_mostly; /* * Enable / Disable tickless mode */ @@ -465,7 +465,7 @@ u64 get_cpu_idle_time_us(int cpu, u64 *last_update_time) struct tick_sched *ts = &per_cpu(tick_cpu_sched, cpu); ktime_t now, idle; - if (!tick_nohz_enabled) + if (!tick_nohz_active) return -1; now = ktime_get(); @@ -506,7 +506,7 @@ u64 get_cpu_iowait_time_us(int cpu, u64 *last_update_time) struct tick_sched *ts = &per_cpu(tick_cpu_sched, cpu); ktime_t now, iowait; - if (!tick_nohz_enabled) + if (!tick_nohz_active) return -1; now = ktime_get(); @@ -799,11 +799,6 @@ void tick_nohz_idle_enter(void) local_irq_disable(); ts = &__get_cpu_var(tick_cpu_sched); - /* - * set ts->inidle unconditionally. even if the system did not - * switch to nohz mode the cpu frequency governers rely on the - * update of the idle time accounting in tick_nohz_start_idle(). - */ ts->inidle = 1; __tick_nohz_idle_enter(ts); @@ -973,7 +968,7 @@ static void tick_nohz_switch_to_nohz(void) struct tick_sched *ts = &__get_cpu_var(tick_cpu_sched); ktime_t next; - if (!tick_nohz_enabled) + if (!tick_nohz_active) return; local_irq_disable(); @@ -981,7 +976,7 @@ static void tick_nohz_switch_to_nohz(void) local_irq_enable(); return; } - + tick_nohz_active = 1; ts->nohz_mode = NOHZ_MODE_LOWRES; /* @@ -1139,8 +1134,10 @@ void tick_setup_sched_timer(void) } #ifdef CONFIG_NO_HZ_COMMON - if (tick_nohz_enabled) + if (tick_nohz_enabled) { ts->nohz_mode = NOHZ_MODE_HIGHRES; + tick_nohz_active = 1; + } #endif } #endif /* HIGH_RES_TIMERS */ -- cgit v1.2.3-70-g09d2 From da554eba2e68c8ec051977db5ee1f42d384a01ed Mon Sep 17 00:00:00 2001 From: Joe Perches Date: Fri, 15 Nov 2013 14:15:31 -0800 Subject: timer: Convert kmalloc_node(...GFP_ZERO...) to kzalloc_node(...) Use the helper function instead of __GFP_ZERO. Signed-off-by: Joe Perches Signed-off-by: Andrew Morton Signed-off-by: Thomas Gleixner --- kernel/timer.c | 5 ++--- 1 file changed, 2 insertions(+), 3 deletions(-) (limited to 'kernel') diff --git a/kernel/timer.c b/kernel/timer.c index 6582b82fa96..accfd241b9e 100644 --- a/kernel/timer.c +++ b/kernel/timer.c @@ -1518,9 +1518,8 @@ static int init_timers_cpu(int cpu) /* * The APs use this path later in boot */ - base = kmalloc_node(sizeof(*base), - GFP_KERNEL | __GFP_ZERO, - cpu_to_node(cpu)); + base = kzalloc_node(sizeof(*base), GFP_KERNEL, + cpu_to_node(cpu)); if (!base) return -ENOMEM; -- cgit v1.2.3-70-g09d2 From 050ded1bbaea3331745cf2782315f5bc2582d083 Mon Sep 17 00:00:00 2001 From: Andrew Morton Date: Fri, 15 Nov 2013 14:15:33 -0800 Subject: tick: Document tick_do_timer_cpu Taken straight from a tglx email ;) Signed-off-by: Andrew Morton Signed-off-by: Thomas Gleixner --- kernel/time/tick-common.c | 15 +++++++++++++++ 1 file changed, 15 insertions(+) (limited to 'kernel') diff --git a/kernel/time/tick-common.c b/kernel/time/tick-common.c index 64522ecdfe0..162b03ab0ad 100644 --- a/kernel/time/tick-common.c +++ b/kernel/time/tick-common.c @@ -33,6 +33,21 @@ DEFINE_PER_CPU(struct tick_device, tick_cpu_device); */ ktime_t tick_next_period; ktime_t tick_period; + +/* + * tick_do_timer_cpu is a timer core internal variable which holds the CPU NR + * which is responsible for calling do_timer(), i.e. the timekeeping stuff. This + * variable has two functions: + * + * 1) Prevent a thundering herd issue of a gazillion of CPUs trying to grab the + * timekeeping lock all at once. Only the CPU which is assigned to do the + * update is handling it. + * + * 2) Hand off the duty in the NOHZ idle case by setting the value to + * TICK_DO_TIMER_NONE, i.e. a non existing CPU. So the next cpu which looks + * at it will take over and keep the time keeping alive. The handover + * procedure also covers cpu hotplug. + */ int tick_do_timer_cpu __read_mostly = TICK_DO_TIMER_BOOT; /* -- cgit v1.2.3-70-g09d2 From d5b5f391d434c5cc8bcb1ab2d759738797b85f52 Mon Sep 17 00:00:00 2001 From: Peter Zijlstra Date: Thu, 14 Nov 2013 16:23:04 +0100 Subject: ftrace, perf: Avoid infinite event generation loop Vince's perf-trinity fuzzer found yet another 'interesting' problem. When we sample the irq_work_exit tracepoint with period==1 (or PERF_SAMPLE_PERIOD) and we add an fasync SIGNAL handler we create an infinite event generation loop: ,-> | irq_work_exit() -> | trace_irq_work_exit() -> | ... | __perf_event_overflow() -> (due to fasync) | irq_work_queue() -> (irq_work_list must be empty) '--------- arch_irq_work_raise() Similar things can happen due to regular poll() wakeups if we exceed the ring-buffer wakeup watermark, or have an event_limit. To avoid this, dis-allow sampling this particular tracepoint. In order to achieve this, create a special perf_perm function pointer for each event and call this (when set) on trying to create a tracepoint perf event. [ roasted: use expr... to allow for ',' in your expression ] Reported-by: Vince Weaver Tested-by: Vince Weaver Signed-off-by: Peter Zijlstra Cc: Steven Rostedt Cc: Dave Jones Cc: Frederic Weisbecker Link: http://lkml.kernel.org/r/20131114152304.GC5364@laptop.programming.kicks-ass.net Signed-off-by: Ingo Molnar --- arch/x86/include/asm/trace/irq_vectors.h | 11 +++++++++++ include/linux/ftrace_event.h | 16 ++++++++++++++++ include/linux/tracepoint.h | 4 ++++ include/trace/ftrace.h | 7 +++++++ kernel/trace/trace_event_perf.c | 6 ++++++ 5 files changed, 44 insertions(+) (limited to 'kernel') diff --git a/arch/x86/include/asm/trace/irq_vectors.h b/arch/x86/include/asm/trace/irq_vectors.h index 2874df24e7a..4cab890007a 100644 --- a/arch/x86/include/asm/trace/irq_vectors.h +++ b/arch/x86/include/asm/trace/irq_vectors.h @@ -71,6 +71,17 @@ DEFINE_IRQ_VECTOR_EVENT(x86_platform_ipi); */ DEFINE_IRQ_VECTOR_EVENT(irq_work); +/* + * We must dis-allow sampling irq_work_exit() because perf event sampling + * itself can cause irq_work, which would lead to an infinite loop; + * + * 1) irq_work_exit happens + * 2) generates perf sample + * 3) generates irq_work + * 4) goto 1 + */ +TRACE_EVENT_PERF_PERM(irq_work_exit, is_sampling_event(p_event) ? -EPERM : 0); + /* * call_function - called when entering/exiting a call function interrupt * vector handler diff --git a/include/linux/ftrace_event.h b/include/linux/ftrace_event.h index 9abbe630c45..8c9b7a1c413 100644 --- a/include/linux/ftrace_event.h +++ b/include/linux/ftrace_event.h @@ -248,6 +248,9 @@ struct ftrace_event_call { #ifdef CONFIG_PERF_EVENTS int perf_refcount; struct hlist_head __percpu *perf_events; + + int (*perf_perm)(struct ftrace_event_call *, + struct perf_event *); #endif }; @@ -317,6 +320,19 @@ struct ftrace_event_file { } \ early_initcall(trace_init_flags_##name); +#define __TRACE_EVENT_PERF_PERM(name, expr...) \ + static int perf_perm_##name(struct ftrace_event_call *tp_event, \ + struct perf_event *p_event) \ + { \ + return ({ expr; }); \ + } \ + static int __init trace_init_perf_perm_##name(void) \ + { \ + event_##name.perf_perm = &perf_perm_##name; \ + return 0; \ + } \ + early_initcall(trace_init_perf_perm_##name); + #define PERF_MAX_TRACE_SIZE 2048 #define MAX_FILTER_STR_VAL 256 /* Should handle KSYM_SYMBOL_LEN */ diff --git a/include/linux/tracepoint.h b/include/linux/tracepoint.h index ebeab360d85..f16dc0a4004 100644 --- a/include/linux/tracepoint.h +++ b/include/linux/tracepoint.h @@ -267,6 +267,8 @@ static inline void tracepoint_synchronize_unregister(void) #define TRACE_EVENT_FLAGS(event, flag) +#define TRACE_EVENT_PERF_PERM(event, expr...) + #endif /* DECLARE_TRACE */ #ifndef TRACE_EVENT @@ -399,4 +401,6 @@ static inline void tracepoint_synchronize_unregister(void) #define TRACE_EVENT_FLAGS(event, flag) +#define TRACE_EVENT_PERF_PERM(event, expr...) + #endif /* ifdef TRACE_EVENT (see note above) */ diff --git a/include/trace/ftrace.h b/include/trace/ftrace.h index 52594b20179..6b852f60f8a 100644 --- a/include/trace/ftrace.h +++ b/include/trace/ftrace.h @@ -90,6 +90,10 @@ #define TRACE_EVENT_FLAGS(name, value) \ __TRACE_EVENT_FLAGS(name, value) +#undef TRACE_EVENT_PERF_PERM +#define TRACE_EVENT_PERF_PERM(name, expr...) \ + __TRACE_EVENT_PERF_PERM(name, expr) + #include TRACE_INCLUDE(TRACE_INCLUDE_FILE) @@ -140,6 +144,9 @@ #undef TRACE_EVENT_FLAGS #define TRACE_EVENT_FLAGS(event, flag) +#undef TRACE_EVENT_PERF_PERM +#define TRACE_EVENT_PERF_PERM(event, expr...) + #include TRACE_INCLUDE(TRACE_INCLUDE_FILE) /* diff --git a/kernel/trace/trace_event_perf.c b/kernel/trace/trace_event_perf.c index 78e27e3b52a..630889f68b1 100644 --- a/kernel/trace/trace_event_perf.c +++ b/kernel/trace/trace_event_perf.c @@ -24,6 +24,12 @@ static int total_ref_count; static int perf_trace_event_perm(struct ftrace_event_call *tp_event, struct perf_event *p_event) { + if (tp_event->perf_perm) { + int ret = tp_event->perf_perm(tp_event, p_event); + if (ret) + return ret; + } + /* The ftrace function trace is allowed only for root. */ if (ftrace_event_is_function(tp_event) && perf_paranoid_tracepoint_raw() && !capable(CAP_SYS_ADMIN)) -- cgit v1.2.3-70-g09d2 From 06db0b21712f878b808480ef31097637013bbf0f Mon Sep 17 00:00:00 2001 From: Peter Zijlstra Date: Fri, 13 Sep 2013 13:14:47 +0200 Subject: perf: Remove fragile swevent hlist optimization Currently we only allocate a single cpu hashtable for per-cpu swevents; do away with this optimization for it is fragile in the face of things like perf_pmu_migrate_context(). The easiest thing is to make sure all CPUs are consistent wrt state. Signed-off-by: Peter Zijlstra Link: http://lkml.kernel.org/r/20130913111447.GN31370@twins.programming.kicks-ass.net Signed-off-by: Ingo Molnar --- kernel/events/core.c | 8 -------- 1 file changed, 8 deletions(-) (limited to 'kernel') diff --git a/kernel/events/core.c b/kernel/events/core.c index d724e7757cd..72348dc192c 100644 --- a/kernel/events/core.c +++ b/kernel/events/core.c @@ -5680,11 +5680,6 @@ static void swevent_hlist_put(struct perf_event *event) { int cpu; - if (event->cpu != -1) { - swevent_hlist_put_cpu(event, event->cpu); - return; - } - for_each_possible_cpu(cpu) swevent_hlist_put_cpu(event, cpu); } @@ -5718,9 +5713,6 @@ static int swevent_hlist_get(struct perf_event *event) int err; int cpu, failed_cpu; - if (event->cpu != -1) - return swevent_hlist_get_cpu(event, event->cpu); - get_online_cpus(); for_each_possible_cpu(cpu) { err = swevent_hlist_get_cpu(event, cpu); -- cgit v1.2.3-70-g09d2 From 0022cedd4a7d8a87841351e2b018bb6794cf2e67 Mon Sep 17 00:00:00 2001 From: Vince Weaver Date: Fri, 15 Nov 2013 12:39:45 -0500 Subject: perf/trace: Properly use u64 to hold event_id The 64-bit attr.config value for perf trace events was being copied into an "int" before doing a comparison, meaning the top 32 bits were being truncated. As far as I can tell this didn't cause any errors, but it did mean it was possible to create valid aliases for all the tracepoint ids which I don't think was intended. (For example, 0xffffffff00000018 and 0x18 both enable the same tracepoint). Signed-off-by: Vince Weaver Signed-off-by: Peter Zijlstra Link: http://lkml.kernel.org/r/alpine.DEB.2.10.1311151236100.11932@vincent-weaver-1.um.maine.edu Signed-off-by: Ingo Molnar --- kernel/trace/trace_event_perf.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'kernel') diff --git a/kernel/trace/trace_event_perf.c b/kernel/trace/trace_event_perf.c index 630889f68b1..e854f420e03 100644 --- a/kernel/trace/trace_event_perf.c +++ b/kernel/trace/trace_event_perf.c @@ -179,7 +179,7 @@ static int perf_trace_event_init(struct ftrace_event_call *tp_event, int perf_trace_init(struct perf_event *p_event) { struct ftrace_event_call *tp_event; - int event_id = p_event->attr.config; + u64 event_id = p_event->attr.config; int ret = -EINVAL; mutex_lock(&event_mutex); -- cgit v1.2.3-70-g09d2 From 9abf24d465180f5f2eb26a43545348262f16b771 Mon Sep 17 00:00:00 2001 From: Srikar Dronamraju Date: Tue, 12 Nov 2013 22:11:26 +0530 Subject: sched: Check sched_domain before computing group power After commit 863bffc80898 ("sched/fair: Fix group power_orig computation"), we can dereference rq->sd before it is set. Fix this by falling back to power_of() in this case and add a comment explaining things. Signed-off-by: Srikar Dronamraju [ Added comment and tweaked patch. ] Signed-off-by: Peter Zijlstra Cc: mikey@neuling.org Link: http://lkml.kernel.org/r/20131113151718.GN21461@twins.programming.kicks-ass.net Signed-off-by: Ingo Molnar --- kernel/sched/fair.c | 27 ++++++++++++++++++++++++--- 1 file changed, 24 insertions(+), 3 deletions(-) (limited to 'kernel') diff --git a/kernel/sched/fair.c b/kernel/sched/fair.c index e8b652ebe02..fd773ade1a3 100644 --- a/kernel/sched/fair.c +++ b/kernel/sched/fair.c @@ -5379,10 +5379,31 @@ void update_group_power(struct sched_domain *sd, int cpu) */ for_each_cpu(cpu, sched_group_cpus(sdg)) { - struct sched_group *sg = cpu_rq(cpu)->sd->groups; + struct sched_group_power *sgp; + struct rq *rq = cpu_rq(cpu); - power_orig += sg->sgp->power_orig; - power += sg->sgp->power; + /* + * build_sched_domains() -> init_sched_groups_power() + * gets here before we've attached the domains to the + * runqueues. + * + * Use power_of(), which is set irrespective of domains + * in update_cpu_power(). + * + * This avoids power/power_orig from being 0 and + * causing divide-by-zero issues on boot. + * + * Runtime updates will correct power_orig. + */ + if (unlikely(!rq->sd)) { + power_orig += power_of(cpu); + power += power_of(cpu); + continue; + } + + sgp = rq->sd->groups->sgp; + power_orig += sgp->power_orig; + power += sgp->power; } } else { /* -- cgit v1.2.3-70-g09d2 From 42eb088ed246a5a817bb45a8b32fe234cf1c0f8b Mon Sep 17 00:00:00 2001 From: Peter Zijlstra Date: Tue, 19 Nov 2013 16:41:49 +0100 Subject: sched: Avoid NULL dereference on sd_busy Commit 37dc6b50cee9 ("sched: Remove unnecessary iteration over sched domains to update nr_busy_cpus") forgot to clear 'sd_busy' under some conditions leading to a possible NULL deref in set_cpu_sd_state_idle(). Reported-by: Anton Blanchard Cc: Preeti U Murthy Signed-off-by: Peter Zijlstra Link: http://lkml.kernel.org/r/20131118113701.GF3866@twins.programming.kicks-ass.net Signed-off-by: Ingo Molnar --- kernel/sched/core.c | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) (limited to 'kernel') diff --git a/kernel/sched/core.c b/kernel/sched/core.c index c1808606ee5..a1591ca7eb5 100644 --- a/kernel/sched/core.c +++ b/kernel/sched/core.c @@ -4910,8 +4910,9 @@ static void update_top_cache_domain(int cpu) if (sd) { id = cpumask_first(sched_domain_span(sd)); size = cpumask_weight(sched_domain_span(sd)); - rcu_assign_pointer(per_cpu(sd_busy, cpu), sd->parent); + sd = sd->parent; /* sd_busy */ } + rcu_assign_pointer(per_cpu(sd_busy, cpu), sd); rcu_assign_pointer(per_cpu(sd_llc, cpu), sd); per_cpu(sd_llc_size, cpu) = size; -- cgit v1.2.3-70-g09d2 From 0515973ffb16c2852a1bb1df2ca1456556faaaa5 Mon Sep 17 00:00:00 2001 From: Shigeru Yoshida Date: Sun, 17 Nov 2013 12:12:36 +0900 Subject: sched: Fix a trivial typo in comments Fix a trivial typo in rq_attach_root(). Signed-off-by: Shigeru Yoshida Signed-off-by: Peter Zijlstra Link: http://lkml.kernel.org/r/20131117.121236.1990617639803941055.shigeru.yoshida@gmail.com Signed-off-by: Ingo Molnar --- kernel/sched/core.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'kernel') diff --git a/kernel/sched/core.c b/kernel/sched/core.c index a1591ca7eb5..718730dd048 100644 --- a/kernel/sched/core.c +++ b/kernel/sched/core.c @@ -4762,7 +4762,7 @@ static void rq_attach_root(struct rq *rq, struct root_domain *rd) cpumask_clear_cpu(rq->cpu, old_rd->span); /* - * If we dont want to free the old_rt yet then + * If we dont want to free the old_rd yet then * set old_rd to NULL to skip the freeing later * in this function: */ -- cgit v1.2.3-70-g09d2 From 4be77398ac9d948773116b6be4a3c91b3d6ea18c Mon Sep 17 00:00:00 2001 From: Martin Schwidefsky Date: Fri, 22 Nov 2013 11:44:51 -0800 Subject: time: Fix 1ns/tick drift w/ GENERIC_TIME_VSYSCALL_OLD Since commit 1e75fa8be9f (time: Condense timekeeper.xtime into xtime_sec - merged in v3.6), there has been an problem with the error accounting in the timekeeping code, such that when truncating to nanoseconds, we round up to the next nsec, but the balancing adjustment to the ntp_error value was dropped. This causes 1ns per tick drift forward of the clock. In 3.7, this logic was isolated to only GENERIC_TIME_VSYSCALL_OLD architectures (s390, ia64, powerpc). The fix is simply to balance the accounting and to subtract the added nanosecond from ntp_error. This allows the internal long-term clock steering to keep the clock accurate. While this fix removes the regression added in 1e75fa8be9f, the ideal solution is to move away from GENERIC_TIME_VSYSCALL_OLD and use the new VSYSCALL method, which avoids entirely the nanosecond granular rounding, and the resulting short-term clock adjustment oscillation needed to keep long term accurate time. [ jstultz: Many thanks to Martin for his efforts identifying this subtle bug, and providing the fix. ] Originally-from: Martin Schwidefsky Cc: Tony Luck Cc: Paul Mackerras Cc: Benjamin Herrenschmidt Cc: Andy Lutomirski Cc: Paul Turner Cc: Steven Rostedt Cc: Richard Cochran Cc: Prarit Bhargava Cc: Fenghua Yu Cc: Thomas Gleixner Cc: stable #v3.6+ Link: http://lkml.kernel.org/r/1385149491-20307-1-git-send-email-john.stultz@linaro.org Signed-off-by: John Stultz Signed-off-by: Thomas Gleixner --- kernel/time/timekeeping.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'kernel') diff --git a/kernel/time/timekeeping.c b/kernel/time/timekeeping.c index 3abf53418b6..87b4f00284c 100644 --- a/kernel/time/timekeeping.c +++ b/kernel/time/timekeeping.c @@ -1347,7 +1347,7 @@ static inline void old_vsyscall_fixup(struct timekeeper *tk) tk->xtime_nsec -= remainder; tk->xtime_nsec += 1ULL << tk->shift; tk->ntp_error += remainder << tk->ntp_error_shift; - + tk->ntp_error -= (1ULL << tk->shift) << tk->ntp_error_shift; } #else #define old_vsyscall_fixup(tk) -- cgit v1.2.3-70-g09d2 From e5fca243abae1445afbfceebda5f08462ef869d3 Mon Sep 17 00:00:00 2001 From: Tejun Heo Date: Fri, 22 Nov 2013 17:14:39 -0500 Subject: cgroup: use a dedicated workqueue for cgroup destruction Since be44562613851 ("cgroup: remove synchronize_rcu() from cgroup_diput()"), cgroup destruction path makes use of workqueue. css freeing is performed from a work item from that point on and a later commit, ea15f8ccdb430 ("cgroup: split cgroup destruction into two steps"), moves css offlining to workqueue too. As cgroup destruction isn't depended upon for memory reclaim, the destruction work items were put on the system_wq; unfortunately, some controller may block in the destruction path for considerable duration while holding cgroup_mutex. As large part of destruction path is synchronized through cgroup_mutex, when combined with high rate of cgroup removals, this has potential to fill up system_wq's max_active of 256. Also, it turns out that memcg's css destruction path ends up queueing and waiting for work items on system_wq through work_on_cpu(). If such operation happens while system_wq is fully occupied by cgroup destruction work items, work_on_cpu() can't make forward progress because system_wq is full and other destruction work items on system_wq can't make forward progress because the work item waiting for work_on_cpu() is holding cgroup_mutex, leading to deadlock. This can be fixed by queueing destruction work items on a separate workqueue. This patch creates a dedicated workqueue - cgroup_destroy_wq - for this purpose. As these work items shouldn't have inter-dependencies and mostly serialized by cgroup_mutex anyway, giving high concurrency level doesn't buy anything and the workqueue's @max_active is set to 1 so that destruction work items are executed one by one on each CPU. Hugh Dickins: Because cgroup_init() is run before init_workqueues(), cgroup_destroy_wq can't be allocated from cgroup_init(). Do it from a separate core_initcall(). In the future, we probably want to reorder so that workqueue init happens before cgroup_init(). Signed-off-by: Tejun Heo Reported-by: Hugh Dickins Reported-by: Shawn Bohrer Link: http://lkml.kernel.org/r/20131111220626.GA7509@sbohrermbp13-local.rgmadvisors.com Link: http://lkml.kernel.org/g/alpine.LNX.2.00.1310301606080.2333@eggly.anvils Cc: stable@vger.kernel.org # v3.9+ --- kernel/cgroup.c | 30 +++++++++++++++++++++++++++--- 1 file changed, 27 insertions(+), 3 deletions(-) (limited to 'kernel') diff --git a/kernel/cgroup.c b/kernel/cgroup.c index 4c62513fe19..a7b98ee35ef 100644 --- a/kernel/cgroup.c +++ b/kernel/cgroup.c @@ -89,6 +89,14 @@ static DEFINE_MUTEX(cgroup_mutex); static DEFINE_MUTEX(cgroup_root_mutex); +/* + * cgroup destruction makes heavy use of work items and there can be a lot + * of concurrent destructions. Use a separate workqueue so that cgroup + * destruction work items don't end up filling up max_active of system_wq + * which may lead to deadlock. + */ +static struct workqueue_struct *cgroup_destroy_wq; + /* * Generate an array of cgroup subsystem pointers. At boot time, this is * populated with the built in subsystems, and modular subsystems are @@ -871,7 +879,7 @@ static void cgroup_free_rcu(struct rcu_head *head) struct cgroup *cgrp = container_of(head, struct cgroup, rcu_head); INIT_WORK(&cgrp->destroy_work, cgroup_free_fn); - schedule_work(&cgrp->destroy_work); + queue_work(cgroup_destroy_wq, &cgrp->destroy_work); } static void cgroup_diput(struct dentry *dentry, struct inode *inode) @@ -4249,7 +4257,7 @@ static void css_free_rcu_fn(struct rcu_head *rcu_head) * css_put(). dput() requires process context which we don't have. */ INIT_WORK(&css->destroy_work, css_free_work_fn); - schedule_work(&css->destroy_work); + queue_work(cgroup_destroy_wq, &css->destroy_work); } static void css_release(struct percpu_ref *ref) @@ -4539,7 +4547,7 @@ static void css_killed_ref_fn(struct percpu_ref *ref) container_of(ref, struct cgroup_subsys_state, refcnt); INIT_WORK(&css->destroy_work, css_killed_work_fn); - schedule_work(&css->destroy_work); + queue_work(cgroup_destroy_wq, &css->destroy_work); } /** @@ -5063,6 +5071,22 @@ out: return err; } +static int __init cgroup_wq_init(void) +{ + /* + * There isn't much point in executing destruction path in + * parallel. Good chunk is serialized with cgroup_mutex anyway. + * Use 1 for @max_active. + * + * We would prefer to do this in cgroup_init() above, but that + * is called before init_workqueues(): so leave this until after. + */ + cgroup_destroy_wq = alloc_workqueue("cgroup_destroy", 0, 1); + BUG_ON(!cgroup_destroy_wq); + return 0; +} +core_initcall(cgroup_wq_init); + /* * proc_cgroup_show() * - Print task's cgroup paths into seq_file, one line for each hierarchy -- cgit v1.2.3-70-g09d2 From 91151228065354a050fd0d190aefdd662a0580aa Mon Sep 17 00:00:00 2001 From: Oleg Nesterov Date: Thu, 14 Nov 2013 12:56:18 +0100 Subject: workqueue: swap set_cpus_allowed_ptr() and PF_NO_SETAFFINITY Move the setting of PF_NO_SETAFFINITY up before set_cpus_allowed() in create_worker(). Otherwise userland can change ->cpus_allowed in between. Signed-off-by: Oleg Nesterov Signed-off-by: Tejun Heo --- kernel/workqueue.c | 9 +++++---- 1 file changed, 5 insertions(+), 4 deletions(-) (limited to 'kernel') diff --git a/kernel/workqueue.c b/kernel/workqueue.c index 987293d03eb..f8942429268 100644 --- a/kernel/workqueue.c +++ b/kernel/workqueue.c @@ -1736,16 +1736,17 @@ static struct worker *create_worker(struct worker_pool *pool) if (IS_ERR(worker->task)) goto fail; + set_user_nice(worker->task, pool->attrs->nice); + + /* prevent userland from meddling with cpumask of workqueue workers */ + worker->task->flags |= PF_NO_SETAFFINITY; + /* * set_cpus_allowed_ptr() will fail if the cpumask doesn't have any * online CPUs. It'll be re-applied when any of the CPUs come up. */ - set_user_nice(worker->task, pool->attrs->nice); set_cpus_allowed_ptr(worker->task, pool->attrs->cpumask); - /* prevent userland from meddling with cpumask of workqueue workers */ - worker->task->flags |= PF_NO_SETAFFINITY; - /* * The caller is responsible for ensuring %POOL_DISASSOCIATED * remains stable across this function. See the comments above the -- cgit v1.2.3-70-g09d2 From 8a2b75384444488fc4f2cbb9f0921b6a0794838f Mon Sep 17 00:00:00 2001 From: Tejun Heo Date: Thu, 5 Sep 2013 12:30:04 -0400 Subject: workqueue: fix ordered workqueues in NUMA setups An ordered workqueue implements execution ordering by using single pool_workqueue with max_active == 1. On a given pool_workqueue, work items are processed in FIFO order and limiting max_active to 1 enforces the queued work items to be processed one by one. Unfortunately, 4c16bd327c ("workqueue: implement NUMA affinity for unbound workqueues") accidentally broke this guarantee by applying NUMA affinity to ordered workqueues too. On NUMA setups, an ordered workqueue would end up with separate pool_workqueues for different nodes. Each pool_workqueue still limits max_active to 1 but multiple work items may be executed concurrently and out of order depending on which node they are queued to. Fix it by using dedicated ordered_wq_attrs[] when creating ordered workqueues. The new attrs match the unbound ones except that no_numa is always set thus forcing all NUMA nodes to share the default pool_workqueue. While at it, add sanity check in workqueue creation path which verifies that an ordered workqueues has only the default pool_workqueue. Signed-off-by: Tejun Heo Reported-by: Libin Cc: stable@vger.kernel.org Cc: Lai Jiangshan --- kernel/workqueue.c | 24 ++++++++++++++++++++++-- 1 file changed, 22 insertions(+), 2 deletions(-) (limited to 'kernel') diff --git a/kernel/workqueue.c b/kernel/workqueue.c index f8942429268..bbb5e9832d8 100644 --- a/kernel/workqueue.c +++ b/kernel/workqueue.c @@ -305,6 +305,9 @@ static DEFINE_HASHTABLE(unbound_pool_hash, UNBOUND_POOL_HASH_ORDER); /* I: attributes used when instantiating standard unbound pools on demand */ static struct workqueue_attrs *unbound_std_wq_attrs[NR_STD_WORKER_POOLS]; +/* I: attributes used when instantiating ordered pools on demand */ +static struct workqueue_attrs *ordered_wq_attrs[NR_STD_WORKER_POOLS]; + struct workqueue_struct *system_wq __read_mostly; EXPORT_SYMBOL(system_wq); struct workqueue_struct *system_highpri_wq __read_mostly; @@ -4107,7 +4110,7 @@ out_unlock: static int alloc_and_link_pwqs(struct workqueue_struct *wq) { bool highpri = wq->flags & WQ_HIGHPRI; - int cpu; + int cpu, ret; if (!(wq->flags & WQ_UNBOUND)) { wq->cpu_pwqs = alloc_percpu(struct pool_workqueue); @@ -4127,6 +4130,13 @@ static int alloc_and_link_pwqs(struct workqueue_struct *wq) mutex_unlock(&wq->mutex); } return 0; + } else if (wq->flags & __WQ_ORDERED) { + ret = apply_workqueue_attrs(wq, ordered_wq_attrs[highpri]); + /* there should only be single pwq for ordering guarantee */ + WARN(!ret && (wq->pwqs.next != &wq->dfl_pwq->pwqs_node || + wq->pwqs.prev != &wq->dfl_pwq->pwqs_node), + "ordering guarantee broken for workqueue %s\n", wq->name); + return ret; } else { return apply_workqueue_attrs(wq, unbound_std_wq_attrs[highpri]); } @@ -5052,13 +5062,23 @@ static int __init init_workqueues(void) } } - /* create default unbound wq attrs */ + /* create default unbound and ordered wq attrs */ for (i = 0; i < NR_STD_WORKER_POOLS; i++) { struct workqueue_attrs *attrs; BUG_ON(!(attrs = alloc_workqueue_attrs(GFP_KERNEL))); attrs->nice = std_nice[i]; unbound_std_wq_attrs[i] = attrs; + + /* + * An ordered wq should have only one pwq as ordering is + * guaranteed by max_active which is enforced by pwqs. + * Turn off NUMA so that dfl_pwq is used for all nodes. + */ + BUG_ON(!(attrs = alloc_workqueue_attrs(GFP_KERNEL))); + attrs->nice = std_nice[i]; + attrs->no_numa = true; + ordered_wq_attrs[i] = attrs; } system_wq = alloc_workqueue("events", 0, 0); -- cgit v1.2.3-70-g09d2 From 9ef28a73ff6a1598d6f915973c282fe28291f800 Mon Sep 17 00:00:00 2001 From: Li Bin Date: Mon, 9 Sep 2013 13:13:58 +0800 Subject: workqueue: fix comment typo for __queue_work() It seems the "dying" should be "draining" here. Signed-off-by: Li Bin Signed-off-by: Tejun Heo --- kernel/workqueue.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'kernel') diff --git a/kernel/workqueue.c b/kernel/workqueue.c index bbb5e9832d8..73bdf3c1f9b 100644 --- a/kernel/workqueue.c +++ b/kernel/workqueue.c @@ -1323,7 +1323,7 @@ static void __queue_work(int cpu, struct workqueue_struct *wq, debug_work_activate(work); - /* if dying, only works from the same workqueue are allowed */ + /* if draining, only works from the same workqueue are allowed */ if (unlikely(wq->flags & __WQ_DRAINING) && WARN_ON_ONCE(!is_chained_work(wq))) return; -- cgit v1.2.3-70-g09d2 From 4e8b22bd1a37447712f1b1d96352fc53b463c6b3 Mon Sep 17 00:00:00 2001 From: Li Bin Date: Tue, 10 Sep 2013 09:52:35 +0800 Subject: workqueue: fix pool ID allocation leakage and remove BUILD_BUG_ON() in init_workqueues When one work starts execution, the high bits of work's data contain pool ID. It can represent a maximum of WORK_OFFQ_POOL_NONE. Pool ID is assigned WORK_OFFQ_POOL_NONE when the work being initialized indicating that no pool is associated and get_work_pool() uses it to check the associated pool. So if worker_pool_assign_id() assigns a ID greater than or equal WORK_OFFQ_POOL_NONE to a pool, it triggers leakage, and it may break the non-reentrance guarantee. This patch fix this issue by modifying the worker_pool_assign_id() function calling idr_alloc() by setting @end param WORK_OFFQ_POOL_NONE. Furthermore, in the current implementation, the BUILD_BUG_ON() in init_workqueues makes no sense. The number of worker pools needed cannot be determined at compile time, because the number of backing pools for UNBOUND workqueues is dynamic based on the assigned custom attributes. So remove it. tj: Minor comment and indentation updates. Signed-off-by: Li Bin Signed-off-by: Tejun Heo --- kernel/workqueue.c | 15 +++++++++------ 1 file changed, 9 insertions(+), 6 deletions(-) (limited to 'kernel') diff --git a/kernel/workqueue.c b/kernel/workqueue.c index 73bdf3c1f9b..c66912be990 100644 --- a/kernel/workqueue.c +++ b/kernel/workqueue.c @@ -521,14 +521,21 @@ static inline void debug_work_activate(struct work_struct *work) { } static inline void debug_work_deactivate(struct work_struct *work) { } #endif -/* allocate ID and assign it to @pool */ +/** + * worker_pool_assign_id - allocate ID and assing it to @pool + * @pool: the pool pointer of interest + * + * Returns 0 if ID in [0, WORK_OFFQ_POOL_NONE) is allocated and assigned + * successfully, -errno on failure. + */ static int worker_pool_assign_id(struct worker_pool *pool) { int ret; lockdep_assert_held(&wq_pool_mutex); - ret = idr_alloc(&worker_pool_idr, pool, 0, 0, GFP_KERNEL); + ret = idr_alloc(&worker_pool_idr, pool, 0, WORK_OFFQ_POOL_NONE, + GFP_KERNEL); if (ret >= 0) { pool->id = ret; return 0; @@ -5020,10 +5027,6 @@ static int __init init_workqueues(void) int std_nice[NR_STD_WORKER_POOLS] = { 0, HIGHPRI_NICE_LEVEL }; int i, cpu; - /* make sure we have enough bits for OFFQ pool ID */ - BUILD_BUG_ON((1LU << (BITS_PER_LONG - WORK_OFFQ_POOL_SHIFT)) < - WORK_CPU_END * NR_STD_WORKER_POOLS); - WARN_ON(__alignof__(struct pool_workqueue) < __alignof__(long long)); pwq_cache = KMEM_CACHE(pool_workqueue, SLAB_PANIC); -- cgit v1.2.3-70-g09d2 From ac01810c9d2814238f08a227062e66a35a0e1ea2 Mon Sep 17 00:00:00 2001 From: Laxman Dewangan Date: Mon, 25 Nov 2013 19:39:47 +0530 Subject: irq: Enable all irqs unconditionally in irq_resume When the system enters suspend, it disables all interrupts in suspend_device_irqs(), including the interrupts marked EARLY_RESUME. On the resume side things are different. The EARLY_RESUME interrupts are reenabled in sys_core_ops->resume and the non EARLY_RESUME interrupts are reenabled in the normal system resume path. When suspend_noirq() failed or suspend is aborted for any other reason, we might omit the resume side call to sys_core_ops->resume() and therefor the interrupts marked EARLY_RESUME are not reenabled and stay disabled forever. To solve this, enable all irqs unconditionally in irq_resume() regardless whether interrupts marked EARLY_RESUMEhave been already enabled or not. This might try to reenable already enabled interrupts in the non failure case, but the only affected platform is XEN and it has been confirmed that it does not cause any side effects. [ tglx: Massaged changelog. ] Signed-off-by: Laxman Dewangan Acked-by-and-tested-by: Konrad Rzeszutek Wilk Acked-by: Heiko Stuebner Reviewed-by: Pavel Machek Cc: Cc: Cc: Cc: Cc: stable@vger.kernel.org Link: http://lkml.kernel.org/r/1385388587-16442-1-git-send-email-ldewangan@nvidia.com Signed-off-by: Thomas Gleixner --- kernel/irq/pm.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'kernel') diff --git a/kernel/irq/pm.c b/kernel/irq/pm.c index cb228bf2176..abcd6ca86cb 100644 --- a/kernel/irq/pm.c +++ b/kernel/irq/pm.c @@ -50,7 +50,7 @@ static void resume_irqs(bool want_early) bool is_early = desc->action && desc->action->flags & IRQF_EARLY_RESUME; - if (is_early != want_early) + if (!is_early && want_early) continue; raw_spin_lock_irqsave(&desc->lock, flags); -- cgit v1.2.3-70-g09d2 From 12997d1a999cd1b22e21a238c96780f2a55e4e13 Mon Sep 17 00:00:00 2001 From: Bjorn Helgaas Date: Mon, 18 Nov 2013 11:00:29 -0700 Subject: Revert "workqueue: allow work_on_cpu() to be called recursively" This reverts commit c2fda509667b0fda4372a237f5a59ea4570b1627. c2fda509667b removed lockdep annotation from work_on_cpu() to work around the PCI path that calls work_on_cpu() from within a work_on_cpu() work item (PF driver .probe() method -> pci_enable_sriov() -> add VFs -> VF driver .probe method). 961da7fb6b22 ("PCI: Avoid unnecessary CPU switch when calling driver .probe() method) avoids that recursive work_on_cpu() use in a different way, so this revert restores the work_on_cpu() lockdep annotation. Signed-off-by: Bjorn Helgaas Acked-by: Tejun Heo --- kernel/workqueue.c | 32 ++++++++++---------------------- 1 file changed, 10 insertions(+), 22 deletions(-) (limited to 'kernel') diff --git a/kernel/workqueue.c b/kernel/workqueue.c index 987293d03eb..5690b8eabfb 100644 --- a/kernel/workqueue.c +++ b/kernel/workqueue.c @@ -2840,19 +2840,6 @@ already_gone: return false; } -static bool __flush_work(struct work_struct *work) -{ - struct wq_barrier barr; - - if (start_flush_work(work, &barr)) { - wait_for_completion(&barr.done); - destroy_work_on_stack(&barr.work); - return true; - } else { - return false; - } -} - /** * flush_work - wait for a work to finish executing the last queueing instance * @work: the work to flush @@ -2866,10 +2853,18 @@ static bool __flush_work(struct work_struct *work) */ bool flush_work(struct work_struct *work) { + struct wq_barrier barr; + lock_map_acquire(&work->lockdep_map); lock_map_release(&work->lockdep_map); - return __flush_work(work); + if (start_flush_work(work, &barr)) { + wait_for_completion(&barr.done); + destroy_work_on_stack(&barr.work); + return true; + } else { + return false; + } } EXPORT_SYMBOL_GPL(flush_work); @@ -4814,14 +4809,7 @@ long work_on_cpu(int cpu, long (*fn)(void *), void *arg) INIT_WORK_ONSTACK(&wfc.work, work_for_cpu_fn); schedule_work_on(cpu, &wfc.work); - - /* - * The work item is on-stack and can't lead to deadlock through - * flushing. Use __flush_work() to avoid spurious lockdep warnings - * when work_on_cpu()s are nested. - */ - __flush_work(&wfc.work); - + flush_work(&wfc.work); return wfc.ret; } EXPORT_SYMBOL_GPL(work_on_cpu); -- cgit v1.2.3-70-g09d2 From 8a56d7761d2d041ae5e8215d20b4167d8aa93f51 Mon Sep 17 00:00:00 2001 From: "Steven Rostedt (Red Hat)" Date: Mon, 25 Nov 2013 20:59:46 -0500 Subject: ftrace: Fix function graph with loading of modules Commit 8c4f3c3fa9681 "ftrace: Check module functions being traced on reload" fixed module loading and unloading with respect to function tracing, but it missed the function graph tracer. If you perform the following # cd /sys/kernel/debug/tracing # echo function_graph > current_tracer # modprobe nfsd # echo nop > current_tracer You'll get the following oops message: ------------[ cut here ]------------ WARNING: CPU: 2 PID: 2910 at /linux.git/kernel/trace/ftrace.c:1640 __ftrace_hash_rec_update.part.35+0x168/0x1b9() Modules linked in: nfsd exportfs nfs_acl lockd ipt_MASQUERADE sunrpc ip6t_REJECT nf_conntrack_ipv6 nf_defrag_ipv6 ip6table_filter ip6_tables uinput snd_hda_codec_idt CPU: 2 PID: 2910 Comm: bash Not tainted 3.13.0-rc1-test #7 Hardware name: To Be Filled By O.E.M. To Be Filled By O.E.M./To be filled by O.E.M., BIOS SDBLI944.86P 05/08/2007 0000000000000668 ffff8800787efcf8 ffffffff814fe193 ffff88007d500000 0000000000000000 ffff8800787efd38 ffffffff8103b80a 0000000000000668 ffffffff810b2b9a ffffffff81a48370 0000000000000001 ffff880037aea000 Call Trace: [] dump_stack+0x4f/0x7c [] warn_slowpath_common+0x81/0x9b [] ? __ftrace_hash_rec_update.part.35+0x168/0x1b9 [] warn_slowpath_null+0x1a/0x1c [] __ftrace_hash_rec_update.part.35+0x168/0x1b9 [] ? __mutex_lock_slowpath+0x364/0x364 [] ftrace_shutdown+0xd7/0x12b [] unregister_ftrace_graph+0x49/0x78 [] graph_trace_reset+0xe/0x10 [] tracing_set_tracer+0xa7/0x26a [] tracing_set_trace_write+0x8b/0xbd [] ? ftrace_return_to_handler+0xb2/0xde [] ? __sb_end_write+0x5e/0x5e [] vfs_write+0xab/0xf6 [] ftrace_graph_caller+0x85/0x85 [] SyS_write+0x59/0x82 [] ftrace_graph_caller+0x85/0x85 [] system_call_fastpath+0x16/0x1b ---[ end trace 940358030751eafb ]--- The above mentioned commit didn't go far enough. Well, it covered the function tracer by adding checks in __register_ftrace_function(). The problem is that the function graph tracer circumvents that (for a slight efficiency gain when function graph trace is running with a function tracer. The gain was not worth this). The problem came with ftrace_startup() which should always be called after __register_ftrace_function(), if you want this bug to be completely fixed. Anyway, this solution moves __register_ftrace_function() inside of ftrace_startup() and removes the need to call them both. Reported-by: Dave Wysochanski Fixes: ed926f9b35cd ("ftrace: Use counters to enable functions to trace") Cc: stable@vger.kernel.org # 3.0+ Signed-off-by: Steven Rostedt --- kernel/trace/ftrace.c | 64 ++++++++++++++++++++++++++++----------------------- 1 file changed, 35 insertions(+), 29 deletions(-) (limited to 'kernel') diff --git a/kernel/trace/ftrace.c b/kernel/trace/ftrace.c index 22fa5569676..0e9f9eaade2 100644 --- a/kernel/trace/ftrace.c +++ b/kernel/trace/ftrace.c @@ -367,9 +367,6 @@ static int remove_ftrace_list_ops(struct ftrace_ops **list, static int __register_ftrace_function(struct ftrace_ops *ops) { - if (unlikely(ftrace_disabled)) - return -ENODEV; - if (FTRACE_WARN_ON(ops == &global_ops)) return -EINVAL; @@ -428,9 +425,6 @@ static int __unregister_ftrace_function(struct ftrace_ops *ops) { int ret; - if (ftrace_disabled) - return -ENODEV; - if (WARN_ON(!(ops->flags & FTRACE_OPS_FL_ENABLED))) return -EBUSY; @@ -2088,10 +2082,15 @@ static void ftrace_startup_enable(int command) static int ftrace_startup(struct ftrace_ops *ops, int command) { bool hash_enable = true; + int ret; if (unlikely(ftrace_disabled)) return -ENODEV; + ret = __register_ftrace_function(ops); + if (ret) + return ret; + ftrace_start_up++; command |= FTRACE_UPDATE_CALLS; @@ -2113,12 +2112,17 @@ static int ftrace_startup(struct ftrace_ops *ops, int command) return 0; } -static void ftrace_shutdown(struct ftrace_ops *ops, int command) +static int ftrace_shutdown(struct ftrace_ops *ops, int command) { bool hash_disable = true; + int ret; if (unlikely(ftrace_disabled)) - return; + return -ENODEV; + + ret = __unregister_ftrace_function(ops); + if (ret) + return ret; ftrace_start_up--; /* @@ -2153,9 +2157,10 @@ static void ftrace_shutdown(struct ftrace_ops *ops, int command) } if (!command || !ftrace_enabled) - return; + return 0; ftrace_run_update_code(command); + return 0; } static void ftrace_startup_sysctl(void) @@ -3060,16 +3065,13 @@ static void __enable_ftrace_function_probe(void) if (i == FTRACE_FUNC_HASHSIZE) return; - ret = __register_ftrace_function(&trace_probe_ops); - if (!ret) - ret = ftrace_startup(&trace_probe_ops, 0); + ret = ftrace_startup(&trace_probe_ops, 0); ftrace_probe_registered = 1; } static void __disable_ftrace_function_probe(void) { - int ret; int i; if (!ftrace_probe_registered) @@ -3082,9 +3084,7 @@ static void __disable_ftrace_function_probe(void) } /* no more funcs left */ - ret = __unregister_ftrace_function(&trace_probe_ops); - if (!ret) - ftrace_shutdown(&trace_probe_ops, 0); + ftrace_shutdown(&trace_probe_ops, 0); ftrace_probe_registered = 0; } @@ -4366,12 +4366,15 @@ core_initcall(ftrace_nodyn_init); static inline int ftrace_init_dyn_debugfs(struct dentry *d_tracer) { return 0; } static inline void ftrace_startup_enable(int command) { } /* Keep as macros so we do not need to define the commands */ -# define ftrace_startup(ops, command) \ - ({ \ - (ops)->flags |= FTRACE_OPS_FL_ENABLED; \ - 0; \ +# define ftrace_startup(ops, command) \ + ({ \ + int ___ret = __register_ftrace_function(ops); \ + if (!___ret) \ + (ops)->flags |= FTRACE_OPS_FL_ENABLED; \ + ___ret; \ }) -# define ftrace_shutdown(ops, command) do { } while (0) +# define ftrace_shutdown(ops, command) __unregister_ftrace_function(ops) + # define ftrace_startup_sysctl() do { } while (0) # define ftrace_shutdown_sysctl() do { } while (0) @@ -4780,9 +4783,7 @@ int register_ftrace_function(struct ftrace_ops *ops) mutex_lock(&ftrace_lock); - ret = __register_ftrace_function(ops); - if (!ret) - ret = ftrace_startup(ops, 0); + ret = ftrace_startup(ops, 0); mutex_unlock(&ftrace_lock); @@ -4801,9 +4802,7 @@ int unregister_ftrace_function(struct ftrace_ops *ops) int ret; mutex_lock(&ftrace_lock); - ret = __unregister_ftrace_function(ops); - if (!ret) - ftrace_shutdown(ops, 0); + ret = ftrace_shutdown(ops, 0); mutex_unlock(&ftrace_lock); return ret; @@ -4997,6 +4996,13 @@ ftrace_suspend_notifier_call(struct notifier_block *bl, unsigned long state, return NOTIFY_DONE; } +/* Just a place holder for function graph */ +static struct ftrace_ops fgraph_ops __read_mostly = { + .func = ftrace_stub, + .flags = FTRACE_OPS_FL_STUB | FTRACE_OPS_FL_GLOBAL | + FTRACE_OPS_FL_RECURSION_SAFE, +}; + int register_ftrace_graph(trace_func_graph_ret_t retfunc, trace_func_graph_ent_t entryfunc) { @@ -5023,7 +5029,7 @@ int register_ftrace_graph(trace_func_graph_ret_t retfunc, ftrace_graph_return = retfunc; ftrace_graph_entry = entryfunc; - ret = ftrace_startup(&global_ops, FTRACE_START_FUNC_RET); + ret = ftrace_startup(&fgraph_ops, FTRACE_START_FUNC_RET); out: mutex_unlock(&ftrace_lock); @@ -5040,7 +5046,7 @@ void unregister_ftrace_graph(void) ftrace_graph_active--; ftrace_graph_return = (trace_func_graph_ret_t)ftrace_stub; ftrace_graph_entry = ftrace_graph_entry_stub; - ftrace_shutdown(&global_ops, FTRACE_STOP_FUNC_RET); + ftrace_shutdown(&fgraph_ops, FTRACE_STOP_FUNC_RET); unregister_pm_notifier(&ftrace_suspend_notifier); unregister_trace_sched_switch(ftrace_graph_probe_sched_switch, NULL); -- cgit v1.2.3-70-g09d2 From 32e475d76a3e40879cd9ee4f69b19615062280d7 Mon Sep 17 00:00:00 2001 From: Thomas Gleixner Date: Thu, 21 Nov 2013 12:41:44 +0100 Subject: sched: Expose preempt_schedule_irq() Tony reported that aa0d53260596 ("ia64: Use preempt_schedule_irq") broke PREEMPT=n builds on ia64. Ok, wrapped my brain around it. I tripped over the magic asm foo which has a single need_resched check and schedule point for both sys call return and interrupt return. So you need the schedule_preempt_irq() for kernel preemption from interrupt return while on a normal syscall preemption a schedule would be sufficient. But using schedule_preempt_irq() is not harmful here in any way. It just sets the preempt_active bit also in cases where it would not be required. Even on preempt=n kernels adding the preempt_active bit is completely harmless. So instead of having an extra function, moving the existing one out of the ifdef PREEMPT looks like the sanest thing to do. It would also allow getting rid of various other sti/schedule/cli asm magic in other archs. Reported-and-Tested-by: Tony Luck Fixes: aa0d53260596 ("ia64: Use preempt_schedule_irq") Signed-off-by: Thomas Gleixner [slightly edited Changelog] Signed-off-by: Peter Zijlstra Link: http://lkml.kernel.org/r/alpine.DEB.2.02.1311211230030.30673@ionos.tec.linutronix.de Signed-off-by: Ingo Molnar --- kernel/sched/core.c | 3 +-- 1 file changed, 1 insertion(+), 2 deletions(-) (limited to 'kernel') diff --git a/kernel/sched/core.c b/kernel/sched/core.c index 718730dd048..e85cda20ab2 100644 --- a/kernel/sched/core.c +++ b/kernel/sched/core.c @@ -2660,6 +2660,7 @@ asmlinkage void __sched notrace preempt_schedule(void) } while (need_resched()); } EXPORT_SYMBOL(preempt_schedule); +#endif /* CONFIG_PREEMPT */ /* * this is the entry point to schedule() from kernel preemption @@ -2693,8 +2694,6 @@ asmlinkage void __sched preempt_schedule_irq(void) exception_exit(prev_state); } -#endif /* CONFIG_PREEMPT */ - int default_wake_function(wait_queue_t *curr, unsigned mode, int wake_flags, void *key) { -- cgit v1.2.3-70-g09d2 From 0fc0287c9ed1ffd3706f8b4d9b314aa102ef1245 Mon Sep 17 00:00:00 2001 From: Peter Zijlstra Date: Tue, 26 Nov 2013 15:03:41 +0100 Subject: cpuset: Fix memory allocator deadlock Juri hit the below lockdep report: [ 4.303391] ====================================================== [ 4.303392] [ INFO: SOFTIRQ-safe -> SOFTIRQ-unsafe lock order detected ] [ 4.303394] 3.12.0-dl-peterz+ #144 Not tainted [ 4.303395] ------------------------------------------------------ [ 4.303397] kworker/u4:3/689 [HC0[0]:SC0[0]:HE0:SE1] is trying to acquire: [ 4.303399] (&p->mems_allowed_seq){+.+...}, at: [] new_slab+0x6c/0x290 [ 4.303417] [ 4.303417] and this task is already holding: [ 4.303418] (&(&q->__queue_lock)->rlock){..-...}, at: [] blk_execute_rq_nowait+0x5b/0x100 [ 4.303431] which would create a new lock dependency: [ 4.303432] (&(&q->__queue_lock)->rlock){..-...} -> (&p->mems_allowed_seq){+.+...} [ 4.303436] [ 4.303898] the dependencies between the lock to be acquired and SOFTIRQ-irq-unsafe lock: [ 4.303918] -> (&p->mems_allowed_seq){+.+...} ops: 2762 { [ 4.303922] HARDIRQ-ON-W at: [ 4.303923] [] __lock_acquire+0x65a/0x1ff0 [ 4.303926] [] lock_acquire+0x93/0x140 [ 4.303929] [] kthreadd+0x86/0x180 [ 4.303931] [] ret_from_fork+0x7c/0xb0 [ 4.303933] SOFTIRQ-ON-W at: [ 4.303933] [] __lock_acquire+0x68c/0x1ff0 [ 4.303935] [] lock_acquire+0x93/0x140 [ 4.303940] [] kthreadd+0x86/0x180 [ 4.303955] [] ret_from_fork+0x7c/0xb0 [ 4.303959] INITIAL USE at: [ 4.303960] [] __lock_acquire+0x344/0x1ff0 [ 4.303963] [] lock_acquire+0x93/0x140 [ 4.303966] [] kthreadd+0x86/0x180 [ 4.303969] [] ret_from_fork+0x7c/0xb0 [ 4.303972] } Which reports that we take mems_allowed_seq with interrupts enabled. A little digging found that this can only be from cpuset_change_task_nodemask(). This is an actual deadlock because an interrupt doing an allocation will hit get_mems_allowed()->...->__read_seqcount_begin(), which will spin forever waiting for the write side to complete. Cc: John Stultz Cc: Mel Gorman Reported-by: Juri Lelli Signed-off-by: Peter Zijlstra Tested-by: Juri Lelli Acked-by: Li Zefan Acked-by: Mel Gorman Signed-off-by: Tejun Heo Cc: stable@vger.kernel.org --- kernel/cpuset.c | 8 ++++++-- 1 file changed, 6 insertions(+), 2 deletions(-) (limited to 'kernel') diff --git a/kernel/cpuset.c b/kernel/cpuset.c index 6bf981e13c4..4772034b4b1 100644 --- a/kernel/cpuset.c +++ b/kernel/cpuset.c @@ -1033,8 +1033,10 @@ static void cpuset_change_task_nodemask(struct task_struct *tsk, need_loop = task_has_mempolicy(tsk) || !nodes_intersects(*newmems, tsk->mems_allowed); - if (need_loop) + if (need_loop) { + local_irq_disable(); write_seqcount_begin(&tsk->mems_allowed_seq); + } nodes_or(tsk->mems_allowed, tsk->mems_allowed, *newmems); mpol_rebind_task(tsk, newmems, MPOL_REBIND_STEP1); @@ -1042,8 +1044,10 @@ static void cpuset_change_task_nodemask(struct task_struct *tsk, mpol_rebind_task(tsk, newmems, MPOL_REBIND_STEP2); tsk->mems_allowed = *newmems; - if (need_loop) + if (need_loop) { write_seqcount_end(&tsk->mems_allowed_seq); + local_irq_enable(); + } task_unlock(tsk); } -- cgit v1.2.3-70-g09d2 From e605b36575e896edd8161534550c9ea021b03bc0 Mon Sep 17 00:00:00 2001 From: Tejun Heo Date: Wed, 27 Nov 2013 18:16:21 -0500 Subject: cgroup: fix cgroup_subsys_state leak for seq_files If a cgroup file implements either read_map() or read_seq_string(), such file is served using seq_file by overriding file->f_op to cgroup_seqfile_operations, which also overrides the release method to single_release() from cgroup_file_release(). Because cgroup_file_open() didn't use to acquire any resources, this used to be fine, but since f7d58818ba42 ("cgroup: pin cgroup_subsys_state when opening a cgroupfs file"), cgroup_file_open() pins the css (cgroup_subsys_state) which is put by cgroup_file_release(). The patch forgot to update the release path for seq_files and each open/release cycle leaks a css reference. Fix it by updating cgroup_file_release() to also handle seq_files and using it for seq_file release path too. Signed-off-by: Tejun Heo Cc: stable@vger.kernel.org # v3.12 --- kernel/cgroup.c | 5 ++++- 1 file changed, 4 insertions(+), 1 deletion(-) (limited to 'kernel') diff --git a/kernel/cgroup.c b/kernel/cgroup.c index a7b98ee35ef..8b729c278b6 100644 --- a/kernel/cgroup.c +++ b/kernel/cgroup.c @@ -199,6 +199,7 @@ static void cgroup_destroy_css_killed(struct cgroup *cgrp); static int cgroup_destroy_locked(struct cgroup *cgrp); static int cgroup_addrm_files(struct cgroup *cgrp, struct cftype cfts[], bool is_add); +static int cgroup_file_release(struct inode *inode, struct file *file); /** * cgroup_css - obtain a cgroup's css for the specified subsystem @@ -2429,7 +2430,7 @@ static const struct file_operations cgroup_seqfile_operations = { .read = seq_read, .write = cgroup_file_write, .llseek = seq_lseek, - .release = single_release, + .release = cgroup_file_release, }; static int cgroup_file_open(struct inode *inode, struct file *file) @@ -2490,6 +2491,8 @@ static int cgroup_file_release(struct inode *inode, struct file *file) ret = cft->release(inode, file); if (css->ss) css_put(css); + if (file->f_op == &cgroup_seqfile_operations) + single_release(inode, file); return ret; } -- cgit v1.2.3-70-g09d2 From 5ecbe3c3c690b5ab493c730c317475287a9e8b45 Mon Sep 17 00:00:00 2001 From: Helge Deller Date: Thu, 28 Nov 2013 09:16:33 +0100 Subject: kernel/extable: fix address-checks for core_kernel and init areas The init_kernel_text() and core_kernel_text() functions should not include the labels _einittext and _etext when checking if an address is inside the .text or .init sections. Signed-off-by: Helge Deller Signed-off-by: Linus Torvalds --- kernel/extable.c | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) (limited to 'kernel') diff --git a/kernel/extable.c b/kernel/extable.c index 832cb28105b..763faf037ec 100644 --- a/kernel/extable.c +++ b/kernel/extable.c @@ -61,7 +61,7 @@ const struct exception_table_entry *search_exception_tables(unsigned long addr) static inline int init_kernel_text(unsigned long addr) { if (addr >= (unsigned long)_sinittext && - addr <= (unsigned long)_einittext) + addr < (unsigned long)_einittext) return 1; return 0; } @@ -69,7 +69,7 @@ static inline int init_kernel_text(unsigned long addr) int core_kernel_text(unsigned long addr) { if (addr >= (unsigned long)_stext && - addr <= (unsigned long)_etext) + addr < (unsigned long)_etext) return 1; if (system_state == SYSTEM_BOOTING && -- cgit v1.2.3-70-g09d2 From 0e576acbc1d9600cf2d9b4a141a2554639959d50 Mon Sep 17 00:00:00 2001 From: Thomas Gleixner Date: Fri, 29 Nov 2013 12:18:13 +0100 Subject: nohz: Fix another inconsistency between CONFIG_NO_HZ=n and nohz=off If CONFIG_NO_HZ=n tick_nohz_get_sleep_length() returns NSEC_PER_SEC/HZ. If CONFIG_NO_HZ=y and the nohz functionality is disabled via the command line option "nohz=off" or not enabled due to missing hardware support, then tick_nohz_get_sleep_length() returns 0. That happens because ts->sleep_length is never set in that case. Set it to NSEC_PER_SEC/HZ when the NOHZ mode is inactive. Reported-by: Michal Hocko Reported-by: Borislav Petkov Signed-off-by: Thomas Gleixner --- kernel/time/tick-sched.c | 4 +++- 1 file changed, 3 insertions(+), 1 deletion(-) (limited to 'kernel') diff --git a/kernel/time/tick-sched.c b/kernel/time/tick-sched.c index a12df5abde0..ea20f7d1ac2 100644 --- a/kernel/time/tick-sched.c +++ b/kernel/time/tick-sched.c @@ -711,8 +711,10 @@ static bool can_stop_idle_tick(int cpu, struct tick_sched *ts) return false; } - if (unlikely(ts->nohz_mode == NOHZ_MODE_INACTIVE)) + if (unlikely(ts->nohz_mode == NOHZ_MODE_INACTIVE)) { + ts->sleep_length = (ktime_t) { .tv64 = NSEC_PER_SEC/HZ }; return false; + } if (need_resched()) return false; -- cgit v1.2.3-70-g09d2 From 3ccb01239201af06a07482ec686b14cd148102a5 Mon Sep 17 00:00:00 2001 From: Steven Rostedt Date: Tue, 3 Dec 2013 12:41:20 -0500 Subject: tracing: Only run synchronize_sched() at instance deletion time It has been reported that boot up with FTRACE_SELFTEST enabled can take a very long time. There can be stalls of over a minute. This was tracked down to the synchronize_sched() called when a system call event is disabled. As the self tests enable and disable thousands of events, this makes the synchronize_sched() get called thousands of times. The synchornize_sched() was added with d562aff93bfb53 "tracing: Add support for SOFT_DISABLE to syscall events" which caused this regression (added in 3.13-rc1). The synchronize_sched() is to protect against the events being accessed when a tracer instance is being deleted. When an instance is being deleted all the events associated to it are unregistered. The synchronize_sched() makes sure that no more users are running when it finishes. Instead of calling synchronize_sched() for all syscall events, we only need to call it once, after the events are unregistered and before the instance is deleted. The event_mutex is held during this action to prevent new users from enabling events. Link: http://lkml.kernel.org/r/20131203124120.427b9661@gandalf.local.home Reported-by: Petr Mladek Acked-by: Tom Zanussi Acked-by: Petr Mladek Tested-by: Petr Mladek Signed-off-by: Steven Rostedt --- kernel/trace/trace_events.c | 3 +++ kernel/trace/trace_syscalls.c | 10 ---------- 2 files changed, 3 insertions(+), 10 deletions(-) (limited to 'kernel') diff --git a/kernel/trace/trace_events.c b/kernel/trace/trace_events.c index f919a2e21bf..a11800ae96d 100644 --- a/kernel/trace/trace_events.c +++ b/kernel/trace/trace_events.c @@ -2314,6 +2314,9 @@ int event_trace_del_tracer(struct trace_array *tr) /* Disable any running events */ __ftrace_set_clr_event_nolock(tr, NULL, NULL, NULL, 0); + /* Access to events are within rcu_read_lock_sched() */ + synchronize_sched(); + down_write(&trace_event_sem); __trace_remove_event_dirs(tr); debugfs_remove_recursive(tr->event_dir); diff --git a/kernel/trace/trace_syscalls.c b/kernel/trace/trace_syscalls.c index e4b6d11bdf7..ea90eb5f6f1 100644 --- a/kernel/trace/trace_syscalls.c +++ b/kernel/trace/trace_syscalls.c @@ -431,11 +431,6 @@ static void unreg_event_syscall_enter(struct ftrace_event_file *file, if (!tr->sys_refcount_enter) unregister_trace_sys_enter(ftrace_syscall_enter, tr); mutex_unlock(&syscall_trace_lock); - /* - * Callers expect the event to be completely disabled on - * return, so wait for current handlers to finish. - */ - synchronize_sched(); } static int reg_event_syscall_exit(struct ftrace_event_file *file, @@ -474,11 +469,6 @@ static void unreg_event_syscall_exit(struct ftrace_event_file *file, if (!tr->sys_refcount_exit) unregister_trace_sys_exit(ftrace_syscall_exit, tr); mutex_unlock(&syscall_trace_lock); - /* - * Callers expect the event to be completely disabled on - * return, so wait for current handlers to finish. - */ - synchronize_sched(); } static int __init init_syscall_trace(struct ftrace_event_call *call) -- cgit v1.2.3-70-g09d2 From 4fc9bbf98fd66f879e628d8537ba7c240be2b58e Mon Sep 17 00:00:00 2001 From: Khalid Aziz Date: Wed, 27 Nov 2013 15:19:25 -0700 Subject: PCI: Disable Bus Master only on kexec reboot Add a flag to tell the PCI subsystem that kernel is shutting down in preparation to kexec a kernel. Add code in PCI subsystem to use this flag to clear Bus Master bit on PCI devices only in case of kexec reboot. This fixes a power-off problem on Acer Aspire V5-573G and likely other machines and avoids any other issues caused by clearing Bus Master bit on PCI devices in normal shutdown path. The problem was introduced by b566a22c2332 ("PCI: disable Bus Master on PCI device shutdown"). This patch is based on discussion at http://marc.info/?l=linux-pci&m=138425645204355&w=2 Link: https://bugzilla.kernel.org/show_bug.cgi?id=63861 Reported-by: Chang Liu Signed-off-by: Khalid Aziz Signed-off-by: Bjorn Helgaas Acked-by: Konstantin Khlebnikov Cc: stable@vger.kernel.org # v3.5+ --- drivers/pci/pci-driver.c | 12 +++++++++--- include/linux/kexec.h | 3 +++ kernel/kexec.c | 4 ++++ 3 files changed, 16 insertions(+), 3 deletions(-) (limited to 'kernel') diff --git a/drivers/pci/pci-driver.c b/drivers/pci/pci-driver.c index 7edd5c30744..25f0bc65916 100644 --- a/drivers/pci/pci-driver.c +++ b/drivers/pci/pci-driver.c @@ -19,6 +19,7 @@ #include #include #include +#include #include "pci.h" struct pci_dynid { @@ -415,12 +416,17 @@ static void pci_device_shutdown(struct device *dev) pci_msi_shutdown(pci_dev); pci_msix_shutdown(pci_dev); +#ifdef CONFIG_KEXEC /* - * Turn off Bus Master bit on the device to tell it to not - * continue to do DMA. Don't touch devices in D3cold or unknown states. + * If this is a kexec reboot, turn off Bus Master bit on the + * device to tell it to not continue to do DMA. Don't touch + * devices in D3cold or unknown states. + * If it is not a kexec reboot, firmware will hit the PCI + * devices with big hammer and stop their DMA any way. */ - if (pci_dev->current_state <= PCI_D3hot) + if (kexec_in_progress && (pci_dev->current_state <= PCI_D3hot)) pci_clear_master(pci_dev); +#endif } #ifdef CONFIG_PM diff --git a/include/linux/kexec.h b/include/linux/kexec.h index d78d28a733b..5fd33dc1fe3 100644 --- a/include/linux/kexec.h +++ b/include/linux/kexec.h @@ -198,6 +198,9 @@ extern u32 vmcoreinfo_note[VMCOREINFO_NOTE_SIZE/4]; extern size_t vmcoreinfo_size; extern size_t vmcoreinfo_max_size; +/* flag to track if kexec reboot is in progress */ +extern bool kexec_in_progress; + int __init parse_crashkernel(char *cmdline, unsigned long long system_ram, unsigned long long *crash_size, unsigned long long *crash_base); int parse_crashkernel_high(char *cmdline, unsigned long long system_ram, diff --git a/kernel/kexec.c b/kernel/kexec.c index 490afc03627..d0d8fca5406 100644 --- a/kernel/kexec.c +++ b/kernel/kexec.c @@ -47,6 +47,9 @@ u32 vmcoreinfo_note[VMCOREINFO_NOTE_SIZE/4]; size_t vmcoreinfo_size; size_t vmcoreinfo_max_size = sizeof(vmcoreinfo_data); +/* Flag to indicate we are going to kexec a new kernel */ +bool kexec_in_progress = false; + /* Location of the reserved area for the crash kernel */ struct resource crashk_res = { .name = "Crash kernel", @@ -1675,6 +1678,7 @@ int kernel_kexec(void) } else #endif { + kexec_in_progress = true; kernel_restart_prepare(NULL); printk(KERN_EMERG "Starting new kernel\n"); machine_shutdown(); -- cgit v1.2.3-70-g09d2 From 7cfe5b3310a1b45f385ff18647bddb487a6c5525 Mon Sep 17 00:00:00 2001 From: Rusty Russell Date: Tue, 10 Dec 2013 17:42:50 +1030 Subject: Ignore generated file kernel/x509_certificate_list $ git status # On branch pending-rebases # Untracked files: # (use "git add ..." to include in what will be committed) # # kernel/x509_certificate_list nothing added to commit but untracked files present (use "git add" to track) $ Signed-off-by: Rusty Russell Signed-off-by: David Howells --- kernel/.gitignore | 1 + 1 file changed, 1 insertion(+) (limited to 'kernel') diff --git a/kernel/.gitignore b/kernel/.gitignore index b3097bde4e9..790d83c7d16 100644 --- a/kernel/.gitignore +++ b/kernel/.gitignore @@ -5,3 +5,4 @@ config_data.h config_data.gz timeconst.h hz.bc +x509_certificate_list -- cgit v1.2.3-70-g09d2 From 62226983da070f7e51068ec2e3a4da34672964c7 Mon Sep 17 00:00:00 2001 From: Hendrik Brueckner Date: Thu, 5 Dec 2013 14:48:22 +0100 Subject: KEYS: correct alignment of system_certificate_list content in assembly file Apart from data-type specific alignment constraints, there are also architecture-specific alignment requirements. For example, on s390 symbols must be on even addresses implying a 2-byte alignment. If the system_certificate_list_end symbol is on an odd address and if this address is loaded, the least-significant bit is ignored. As a result, the load_system_certificate_list() fails to load the certificates because of a wrong certificate length calculation. To be safe, align system_certificate_list on an 8-byte boundary. Also improve the length calculation of the system_certificate_list content. Introduce a system_certificate_list_size (8-byte aligned because of unsigned long) variable that stores the length. Let the linker calculate this size by introducing a start and end label for the certificate content. Signed-off-by: Hendrik Brueckner Signed-off-by: David Howells --- kernel/system_certificates.S | 14 ++++++++++++-- kernel/system_keyring.c | 4 ++-- 2 files changed, 14 insertions(+), 4 deletions(-) (limited to 'kernel') diff --git a/kernel/system_certificates.S b/kernel/system_certificates.S index 4aef390671c..3e9868d4753 100644 --- a/kernel/system_certificates.S +++ b/kernel/system_certificates.S @@ -3,8 +3,18 @@ __INITRODATA + .align 8 .globl VMLINUX_SYMBOL(system_certificate_list) VMLINUX_SYMBOL(system_certificate_list): +__cert_list_start: .incbin "kernel/x509_certificate_list" - .globl VMLINUX_SYMBOL(system_certificate_list_end) -VMLINUX_SYMBOL(system_certificate_list_end): +__cert_list_end: + + .align 8 + .globl VMLINUX_SYMBOL(system_certificate_list_size) +VMLINUX_SYMBOL(system_certificate_list_size): +#ifdef CONFIG_64BIT + .quad __cert_list_end - __cert_list_start +#else + .long __cert_list_end - __cert_list_start +#endif diff --git a/kernel/system_keyring.c b/kernel/system_keyring.c index 564dd93430a..52ebc70263f 100644 --- a/kernel/system_keyring.c +++ b/kernel/system_keyring.c @@ -22,7 +22,7 @@ struct key *system_trusted_keyring; EXPORT_SYMBOL_GPL(system_trusted_keyring); extern __initconst const u8 system_certificate_list[]; -extern __initconst const u8 system_certificate_list_end[]; +extern __initconst const unsigned long system_certificate_list_size; /* * Load the compiled-in keys @@ -60,8 +60,8 @@ static __init int load_system_certificate_list(void) pr_notice("Loading compiled-in X.509 certificates\n"); - end = system_certificate_list_end; p = system_certificate_list; + end = p + system_certificate_list_size; while (p < end) { /* Each cert begins with an ASN.1 SEQUENCE tag and must be more * than 256 bytes in size. -- cgit v1.2.3-70-g09d2 From f12d5bfceb7e1f9051563381ec047f7f13956c3c Mon Sep 17 00:00:00 2001 From: Linus Torvalds Date: Thu, 12 Dec 2013 09:38:42 -0800 Subject: futex: fix handling of read-only-mapped hugepages The hugepage code had the exact same bug that regular pages had in commit 7485d0d3758e ("futexes: Remove rw parameter from get_futex_key()"). The regular page case was fixed by commit 9ea71503a8ed ("futex: Fix regression with read only mappings"), but the transparent hugepage case (added in a5b338f2b0b1: "thp: update futex compound knowledge") case remained broken. Found by Dave Jones and his trinity tool. Reported-and-tested-by: Dave Jones Cc: stable@kernel.org # v2.6.38+ Acked-by: Thomas Gleixner Cc: Mel Gorman Cc: Darren Hart Cc: Andrea Arcangeli Cc: Oleg Nesterov Signed-off-by: Linus Torvalds --- kernel/futex.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'kernel') diff --git a/kernel/futex.c b/kernel/futex.c index 80ba086f021..02febad0079 100644 --- a/kernel/futex.c +++ b/kernel/futex.c @@ -288,7 +288,7 @@ again: put_page(page); /* serialize against __split_huge_page_splitting() */ local_irq_disable(); - if (likely(__get_user_pages_fast(address, 1, 1, &page) == 1)) { + if (likely(__get_user_pages_fast(address, 1, !ro, &page) == 1)) { page_head = compound_head(page); /* * page_head is valid pointer but we must pin -- cgit v1.2.3-70-g09d2 From 5cdec2d833748fbd27d3682f7209225c504c79c5 Mon Sep 17 00:00:00 2001 From: Linus Torvalds Date: Thu, 12 Dec 2013 09:53:51 -0800 Subject: futex: move user address verification up to common code When debugging the read-only hugepage case, I was confused by the fact that get_futex_key() did an access_ok() only for the non-shared futex case, since the user address checking really isn't in any way specific to the private key handling. Now, it turns out that the shared key handling does effectively do the equivalent checks inside get_user_pages_fast() (it doesn't actually check the address range on x86, but does check the page protections for being a user page). So it wasn't actually a bug, but the fact that we treat the address differently for private and shared futexes threw me for a loop. Just move the check up, so that it gets done for both cases. Also, use the 'rw' parameter for the type, even if it doesn't actually matter any more (it's a historical artifact of the old racy i386 "page faults from kernel space don't check write protections"). Cc: Thomas Gleixner Signed-off-by: Linus Torvalds --- kernel/futex.c | 5 +++-- 1 file changed, 3 insertions(+), 2 deletions(-) (limited to 'kernel') diff --git a/kernel/futex.c b/kernel/futex.c index 02febad0079..f6ff0191ecf 100644 --- a/kernel/futex.c +++ b/kernel/futex.c @@ -251,6 +251,9 @@ get_futex_key(u32 __user *uaddr, int fshared, union futex_key *key, int rw) return -EINVAL; address -= key->both.offset; + if (unlikely(!access_ok(rw, uaddr, sizeof(u32)))) + return -EFAULT; + /* * PROCESS_PRIVATE futexes are fast. * As the mm cannot disappear under us and the 'key' only needs @@ -259,8 +262,6 @@ get_futex_key(u32 __user *uaddr, int fshared, union futex_key *key, int rw) * but access_ok() should be faster than find_vma() */ if (!fshared) { - if (unlikely(!access_ok(VERIFY_WRITE, uaddr, sizeof(u32)))) - return -EFAULT; key->private.mm = mm; key->private.address = address; get_futex_key_refs(key); -- cgit v1.2.3-70-g09d2