From c84a3b27798dfce928b867fa1c9f3c3fd66f0a31 Mon Sep 17 00:00:00 2001 From: Tejun Heo Date: Sat, 23 Nov 2013 18:01:46 -0500 Subject: sysfs: drop kobj_ns_type handling, take #2 The way namespace tags are implemented in sysfs is more complicated than necessary. As each tag is a pointer value and required to be non-NULL under a namespace enabled parent, there's no need to record separately what type each tag is. If multiple namespace types are needed, which currently isn't the case, we can simply compare the tag to a set of allowed tags in the superblock, assuming that the tags, being pointers, won't have the same value across multiple types. This patch rips out kobj_ns_type handling from sysfs. sysfs now has an enable switch to turn on namespace under a node. If enabled, all children are required to have non-NULL namespace tags and are filtered against the super_block's tag. kobject namespace determination is now performed in lib/kobject.c::create_dir(), making sysfs_read_ns_type() unnecessary. The sanity checks are also moved. create_dir() is restructured to ease such addition. This removes most kobject namespace knowledge from sysfs proper, which will enable proper separation and layering of sysfs. This is the second try. The first one was cb26a311578e ("sysfs: drop kobj_ns_type handling") which tried to automatically enable namespace if there are children with non-NULL namespace tags; however, it was broken for symlinks as they should inherit the target's tag iff namespace is enabled in the parent. This led to namespace filtering being enabled incorrectly for wireless net class devices through phy80211 symlinks and thus network configuration failure. a1212d278c05 ("Revert "sysfs: drop kobj_ns_type handling"") reverted the commit. This shouldn't introduce any behavior changes, for real. v2: Dummy implementation of sysfs_enable_ns() for !CONFIG_SYSFS was missing and caused build failure. Reported by the kbuild test robot. Signed-off-by: Tejun Heo Reported-by: Linus Torvalds Cc: Eric W. Biederman Cc: Kay Sievers Cc: Greg Kroah-Hartman Cc: kbuild test robot Signed-off-by: Greg Kroah-Hartman --- lib/kobject.c | 27 ++++++++++++++++++++++----- 1 file changed, 22 insertions(+), 5 deletions(-) (limited to 'lib') diff --git a/lib/kobject.c b/lib/kobject.c index 5b4b8886435..16e9335b32d 100644 --- a/lib/kobject.c +++ b/lib/kobject.c @@ -65,13 +65,17 @@ static int populate_dir(struct kobject *kobj) static int create_dir(struct kobject *kobj) { + const struct kobj_ns_type_operations *ops; int error; error = sysfs_create_dir_ns(kobj, kobject_namespace(kobj)); - if (!error) { - error = populate_dir(kobj); - if (error) - sysfs_remove_dir(kobj); + if (error) + return error; + + error = populate_dir(kobj); + if (error) { + sysfs_remove_dir(kobj); + return error; } /* @@ -80,7 +84,20 @@ static int create_dir(struct kobject *kobj) */ sysfs_get(kobj->sd); - return error; + /* + * If @kobj has ns_ops, its children need to be filtered based on + * their namespace tags. Enable namespace support on @kobj->sd.
+ */ + ops = kobj_child_ns_ops(kobj); + if (ops) { + BUG_ON(ops->type <= KOBJ_NS_TYPE_NONE); + BUG_ON(ops->type >= KOBJ_NS_TYPES); + BUG_ON(!kobj_ns_type_registered(ops->type)); + + sysfs_enable_ns(kobj->sd); + } + + return 0; } static int get_kobj_path_length(struct kobject *kobj) -- cgit v1.2.3-18-g5258 From 93b2b8e4aa4317e3fe6414d117deb5f3c362e8bb Mon Sep 17 00:00:00 2001 From: Tejun Heo Date: Thu, 28 Nov 2013 14:54:15 -0500 Subject: sysfs, kernfs: introduce kernfs_create_dir[_ns]() Introduce a kernfs interface to manipulate a directory which takes and returns sysfs_dirents. create_dir() is renamed to kernfs_create_dir_ns() and its arguments and return value are updated. create_dir() usages are replaced with kernfs_create_dir_ns() and sysfs_create_subdir() usages are replaced with kernfs_create_dir(). Dup warnings are handled explicitly by sysfs users of the kernfs interface. sysfs_enable_ns() is renamed to kernfs_enable_ns(). This patch doesn't introduce any behavior changes. v2: Dummy implementation for !CONFIG_SYSFS updated to return -ENOSYS. v3: kernfs_enable_ns() added. v4: Refreshed on top of "sysfs: drop kobj_ns_type handling, take #2" so that this patch removes sysfs_enable_ns(). Signed-off-by: Tejun Heo Signed-off-by: Greg Kroah-Hartman --- lib/kobject.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'lib') diff --git a/lib/kobject.c b/lib/kobject.c index 16e9335b32d..b8d848fb137 100644 --- a/lib/kobject.c +++ b/lib/kobject.c @@ -94,7 +94,7 @@ static int create_dir(struct kobject *kobj) BUG_ON(ops->type >= KOBJ_NS_TYPES); BUG_ON(!kobj_ns_type_registered(ops->type)); - sysfs_enable_ns(kobj->sd); + kernfs_enable_ns(kobj->sd); } return 0; -- cgit v1.2.3-18-g5258 From 89c86a64cd056e283323710c9ddf6f7090a450c8 Mon Sep 17 00:00:00 2001 From: Bjorn Helgaas Date: Thu, 5 Dec 2013 17:37:51 -0700 Subject: kobject: delay kobject release for random time When CONFIG_DEBUG_KOBJECT_RELEASE=y, delay kobject release functions for a random time between 1 and 4 seconds, which effectively changes the order in which they're called. Signed-off-by: Bjorn Helgaas Signed-off-by: Greg Kroah-Hartman --- lib/kobject.c | 9 ++++++--- 1 file changed, 6 insertions(+), 3 deletions(-) (limited to 'lib') diff --git a/lib/kobject.c b/lib/kobject.c index b8d848fb137..1d110dc95db 100644 --- a/lib/kobject.c +++ b/lib/kobject.c @@ -18,6 +18,7 @@ #include #include #include +#include <linux/random.h> /** * kobject_namespace - return @kobj's namespace tag */ @@ -642,10 +643,12 @@ static void kobject_release(struct kref *kref) { struct kobject *kobj = container_of(kref, struct kobject, kref); #ifdef CONFIG_DEBUG_KOBJECT_RELEASE - pr_info("kobject: '%s' (%p): %s, parent %p (delayed)\n", - kobject_name(kobj), kobj, __func__, kobj->parent); + unsigned long delay = HZ + HZ * (get_random_int() & 0x3); + pr_info("kobject: '%s' (%p): %s, parent %p (delayed %ld)\n", + kobject_name(kobj), kobj, __func__, kobj->parent, delay); INIT_DELAYED_WORK(&kobj->release, kobject_delayed_cleanup); - schedule_delayed_work(&kobj->release, HZ); + + schedule_delayed_work(&kobj->release, delay); #else kobject_cleanup(kobj); #endif -- cgit v1.2.3-18-g5258 From 35a5fe695b07ae899510ad76fdf0aeaef85fe951 Mon Sep 17 00:00:00 2001 From: Bjorn Helgaas Date: Thu, 5 Dec 2013 17:38:00 -0700 Subject: kobject: remove kset from sysfs immediately in kset_unregister() There's no "unlink from sysfs" interface for ksets, so I think callers of kset_unregister() expect the kset to be removed from sysfs immediately, without waiting for the last reference to be released.
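(A hedged illustration, not part of the patch: the load/unload pattern described above, with a made-up module and kset name. Without an immediate kobject_del(), a quick rmmod/insmod cycle can find the old sysfs entry still present and fail with a duplicate-name warning.)

	#include <linux/module.h>
	#include <linux/kobject.h>

	static struct kset *example_kset;

	static int __init example_init(void)
	{
		/* creates /sys/kernel/example */
		example_kset = kset_create_and_add("example", NULL, kernel_kobj);
		return example_kset ? 0 : -ENOMEM;
	}

	static void __exit example_exit(void)
	{
		/* before this patch, the sysfs entry could outlive this call */
		kset_unregister(example_kset);
	}

	module_init(example_init);
	module_exit(example_exit);
	MODULE_LICENSE("GPL");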
This patch makes the sysfs removal happen immediately, so the caller may create a new kset with the same name as soon as kset_unregister() returns. Without this, every caller has to call "kobject_del(&kset->kobj)" first unless it knows it will never create a new kset with the same name. This sometimes shows up on module unload and reload, where the reload fails because it tries to create a kobject with the same name as one from the original load that still exists. CONFIG_DEBUG_KOBJECT_RELEASE=y makes this problem easier to hit. Signed-off-by: Bjorn Helgaas Signed-off-by: Greg Kroah-Hartman --- lib/kobject.c | 1 + 1 file changed, 1 insertion(+) (limited to 'lib') diff --git a/lib/kobject.c b/lib/kobject.c index 1d110dc95db..98b45bb33c8 100644 --- a/lib/kobject.c +++ b/lib/kobject.c @@ -855,6 +855,7 @@ void kset_unregister(struct kset *k) { if (!k) return; + kobject_del(&k->kobj); kobject_put(&k->kobj); } -- cgit v1.2.3-18-g5258 From 0d6077f8b48ed2dce8f2466a76c0d574a3b4dbe9 Mon Sep 17 00:00:00 2001 From: Ming Lei Date: Tue, 26 Nov 2013 12:43:37 +0800 Subject: lib/scatterlist: export sg_miter_skip() sg_copy_buffer() can't meet the needs of some drivers (such as USB mass storage), which have to use the sg_miter_* APIs to access the sg buffer directly, so sg_miter_skip() needs to be exported for them. The export is needed for converting the USB storage driver to the sg_miter_* APIs for sg buffer access. Acked-by: Andrew Morton Cc: FUJITA Tomonori Cc: Jens Axboe Signed-off-by: Ming Lei Reviewed-by: Tejun Heo Signed-off-by: Greg Kroah-Hartman --- lib/scatterlist.c | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) (limited to 'lib') diff --git a/lib/scatterlist.c b/lib/scatterlist.c index d16fa295ae1..3a8e8e8fb2a 100644 --- a/lib/scatterlist.c +++ b/lib/scatterlist.c @@ -495,7 +495,7 @@ static bool sg_miter_get_next_page(struct sg_mapping_iter *miter) * true if @miter contains the valid mapping. false if end of sg * list is reached. */ -static bool sg_miter_skip(struct sg_mapping_iter *miter, off_t offset) +bool sg_miter_skip(struct sg_mapping_iter *miter, off_t offset) { sg_miter_stop(miter); @@ -513,6 +513,7 @@ static bool sg_miter_skip(struct sg_mapping_iter *miter, off_t offset) return true; } +EXPORT_SYMBOL(sg_miter_skip); /** * sg_miter_next - proceed mapping iterator to the next mapping -- cgit v1.2.3-18-g5258 From 020d30f17f196dcbf0c2c68a874345e8885a3149 Mon Sep 17 00:00:00 2001 From: Maurizio Lombardi Date: Fri, 8 Nov 2013 15:28:25 +0100 Subject: kobject: fix memory leak in kobject_set_name_vargs If the call to kvasprintf() fails, the old name of the object will be leaked; this patch fixes the bug by restoring the old name before returning -ENOMEM. Signed-off-by: Maurizio Lombardi Signed-off-by: Greg Kroah-Hartman --- lib/kobject.c | 4 +++- 1 file changed, 3 insertions(+), 1 deletion(-) (limited to 'lib') diff --git a/lib/kobject.c b/lib/kobject.c index 98b45bb33c8..94b321f4ac6 100644 --- a/lib/kobject.c +++ b/lib/kobject.c @@ -265,8 +265,10 @@ int kobject_set_name_vargs(struct kobject *kobj, const char *fmt, return 0; kobj->name = kvasprintf(GFP_KERNEL, fmt, vargs); - if (!kobj->name) + if (!kobj->name) { + kobj->name = old_name; return -ENOMEM; + } /* ewww... some of these buggers have '/' in the name ...
*/ while ((s = strchr(kobj->name, '/'))) -- cgit v1.2.3-18-g5258 From 324a56e16e44baecac3ca799fd216154145c14bf Mon Sep 17 00:00:00 2001 From: Tejun Heo Date: Wed, 11 Dec 2013 14:11:53 -0500 Subject: kernfs: s/sysfs_dirent/kernfs_node/ and rename its friends accordingly kernfs has just been separated out from sysfs and we're already in full conflict mode. Nothing can make the situation any worse. Let's take the chance to name things properly. This patch performs the following renames. * s/sysfs_elem_dir/kernfs_elem_dir/ * s/sysfs_elem_symlink/kernfs_elem_symlink/ * s/sysfs_elem_attr/kernfs_elem_file/ * s/sysfs_dirent/kernfs_node/ * s/sd/kn/ in kernfs proper * s/parent_sd/parent/ * s/target_sd/target/ * s/dir_sd/parent/ * s/to_sysfs_dirent()/rb_to_kn()/ * misc renames of local vars when they conflict with the above Because md, mic and gpio dig into sysfs details, this patch ends up modifying them. All are sysfs_dirent renames and trivial. While we can avoid these by introducing a dummy wrapping struct sysfs_dirent around kernfs_node, given the limited usage outside kernfs and sysfs proper, I don't think such a workaround is called for. This patch is strictly rename-only and doesn't introduce any functional difference. - mic / gpio renames were missing. Spotted by the kbuild test robot. Signed-off-by: Tejun Heo Cc: Neil Brown Cc: Linus Walleij Cc: Ashutosh Dixit Cc: kbuild test robot Signed-off-by: Greg Kroah-Hartman --- lib/kobject.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'lib') diff --git a/lib/kobject.c b/lib/kobject.c index 94b321f4ac6..064451f2a6c 100644 --- a/lib/kobject.c +++ b/lib/kobject.c @@ -556,7 +556,7 @@ out: */ void kobject_del(struct kobject *kobj) { - struct sysfs_dirent *sd; + struct kernfs_node *sd; if (!kobj) return; -- cgit v1.2.3-18-g5258 From 71ae8aac3e198c6f3577cb7ad3a17f6128e97bfa Mon Sep 17 00:00:00 2001 From: Francesco Fusco Date: Thu, 12 Dec 2013 16:09:05 +0100 Subject: lib: introduce arch optimized hash library We introduce a new hashing library that is meant to be used in the contexts where speed is more important than uniformity of the hashed values. The hash library leverages architecture-specific implementations to achieve high performance and falls back to jhash() for the generic case. On Intel-based x86 architectures, the library can exploit the crc32l instruction, part of the Intel SSE4.2 instruction set, if the instruction is supported by the processor. This implementation is twice as fast as the jhash() implementation on an i7 processor. Additional architectures, such as arm64, provide instructions for accelerating the computation of CRC, so they could be added as well in follow-up work. Signed-off-by: Francesco Fusco Signed-off-by: Daniel Borkmann Signed-off-by: Thomas Graf Cc: linux-kernel@vger.kernel.org Signed-off-by: David S.
Miller --- lib/Makefile | 2 +- lib/hash.c | 38 ++++++++++++++++++++++++++++++++++++++ 2 files changed, 39 insertions(+), 1 deletion(-) create mode 100644 lib/hash.c (limited to 'lib') diff --git a/lib/Makefile b/lib/Makefile index a459c31e8c6..d0f79c547d9 100644 --- a/lib/Makefile +++ b/lib/Makefile @@ -26,7 +26,7 @@ obj-y += bcd.o div64.o sort.o parser.o halfmd4.o debug_locks.o random32.o \ bust_spinlocks.o hexdump.o kasprintf.o bitmap.o scatterlist.o \ gcd.o lcm.o list_sort.o uuid.o flex_array.o iovec.o clz_ctz.o \ bsearch.o find_last_bit.o find_next_bit.o llist.o memweight.o kfifo.o \ - percpu-refcount.o percpu_ida.o + percpu-refcount.o percpu_ida.o hash.o obj-y += string_helpers.o obj-$(CONFIG_TEST_STRING_HELPERS) += test-string_helpers.o obj-y += kstrtox.o diff --git a/lib/hash.c b/lib/hash.c new file mode 100644 index 00000000000..b89f06a2d60 --- /dev/null +++ b/lib/hash.c @@ -0,0 +1,38 @@ +/* General purpose hashing library + * + * That's a start of a kernel hashing library, which can be extended + * with further algorithms in future. arch_fast_hash{2,}() will + * eventually resolve to an architecture optimized implementation. + * + * Copyright 2013 Francesco Fusco + * Copyright 2013 Daniel Borkmann + * Copyright 2013 Thomas Graf + * Licensed under the GNU General Public License, version 2.0 (GPLv2) + */ + +#include <linux/jhash.h> +#include <linux/hash.h> + +static struct fast_hash_ops arch_hash_ops __read_mostly = { + .hash = jhash, + .hash2 = jhash2, +}; + +u32 arch_fast_hash(const void *data, u32 len, u32 seed) +{ + return arch_hash_ops.hash(data, len, seed); +} +EXPORT_SYMBOL_GPL(arch_fast_hash); + +u32 arch_fast_hash2(const u32 *data, u32 len, u32 seed) +{ + return arch_hash_ops.hash2(data, len, seed); +} +EXPORT_SYMBOL_GPL(arch_fast_hash2); + +static int __init hashlib_init(void) +{ + setup_arch_fast_hash(&arch_hash_ops); + return 0; +} +early_initcall(hashlib_init); -- cgit v1.2.3-18-g5258 From 237217546d44fe06c16b8895eecaef22f18ba5ee Mon Sep 17 00:00:00 2001 From: Francesco Fusco Date: Wed, 18 Dec 2013 16:05:48 +0100 Subject: lib: hash: follow-up fixups for arch hash This patch adds the include file to pull in __read_mostly on some architectures, e.g. ppc, and also fixes up signatures in the generic asm implementation. Reported-by: Fengguang Wu Signed-off-by: Francesco Fusco Signed-off-by: David S. Miller --- lib/hash.c | 1 + 1 file changed, 1 insertion(+) (limited to 'lib') diff --git a/lib/hash.c b/lib/hash.c index b89f06a2d60..fea973f4bd5 100644 --- a/lib/hash.c +++ b/lib/hash.c @@ -12,6 +12,7 @@ #include <linux/jhash.h> #include <linux/hash.h> +#include <linux/cache.h> static struct fast_hash_ops arch_hash_ops __read_mostly = { .hash = jhash, .hash2 = jhash2, }; -- cgit v1.2.3-18-g5258 From eb4c69033fd1dda8b60e48af3accdd578c2e59ed Mon Sep 17 00:00:00 2001 From: Greg Kroah-Hartman Date: Sat, 4 Jan 2014 19:56:40 -0800 Subject: Revert "kobject: introduce kobj_completion" This reverts commit eee031649707db3c9920d9498f8d03819b74fc23. Jeff writes: I have no objections to reverting it. There were concerns from Al Viro that it'd be tough to get right by callers and I had assumed it got dropped after that. I had planned on using it in my btrfs sysfs exports patchset but came up with a better way.
Cc: Jeff Mahoney Cc: Al Viro Signed-off-by: Greg Kroah-Hartman --- lib/kobject.c | 50 -------------------------------------------------- 1 file changed, 50 deletions(-) (limited to 'lib') diff --git a/lib/kobject.c b/lib/kobject.c index 064451f2a6c..f7f69cbe4f6 100644 --- a/lib/kobject.c +++ b/lib/kobject.c @@ -13,7 +13,6 @@ */ #include -#include <linux/kobj_completion.h> #include #include #include @@ -781,55 +780,6 @@ const struct sysfs_ops kobj_sysfs_ops = { .show = kobj_attr_show, .store = kobj_attr_store, }; -/** - * kobj_completion_init - initialize a kobj_completion object. - * @kc: kobj_completion - * @ktype: type of kobject to initialize - * - * kobj_completion structures can be embedded within structures with different - * lifetime rules. During the release of the enclosing object, we can - * wait on the release of the kobject so that we don't free it while it's - * still busy. - */ -void kobj_completion_init(struct kobj_completion *kc, struct kobj_type *ktype) -{ - init_completion(&kc->kc_unregister); - kobject_init(&kc->kc_kobj, ktype); -} -EXPORT_SYMBOL_GPL(kobj_completion_init); - -/** - * kobj_completion_release - release a kobj_completion object - * @kobj: kobject embedded in kobj_completion - * - * Used with kobject_release to notify waiters that the kobject has been - * released. - */ -void kobj_completion_release(struct kobject *kobj) -{ - struct kobj_completion *kc = kobj_to_kobj_completion(kobj); - complete(&kc->kc_unregister); -} -EXPORT_SYMBOL_GPL(kobj_completion_release); - -/** - * kobj_completion_del_and_wait - release the kobject and wait for it - * @kc: kobj_completion object to release - * - * Delete the kobject from sysfs and drop the reference count. Then wait - * until any other outstanding references are also dropped. This routine - * is only necessary once other references may have been taken on the - * kobject. Typically this happens when the kobject has been published - * to sysfs via kobject_add. - */ -void kobj_completion_del_and_wait(struct kobj_completion *kc) -{ - kobject_del(&kc->kc_kobj); - kobject_put(&kc->kc_kobj); - wait_for_completion(&kc->kc_unregister); -} -EXPORT_SYMBOL_GPL(kobj_completion_del_and_wait); - /** * kset_register - initialize and add a kset. * @k: kset. -- cgit v1.2.3-18-g5258 From 0cb637bff80d5ba2b916bb19f19ffd59cd4079fd Mon Sep 17 00:00:00 2001 From: Konrad Rzeszutek Wilk Date: Mon, 16 Dec 2013 14:05:01 -0500 Subject: swiotlb: Don't DoS us with 'swiotlb buffer is full' (v2) There is no need for that, so let's use ratelimiting. Also add some extra information to be helpful.
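(A hedged aside on the mechanism: printk_ratelimit(), used in the hunk below, shares one global ratelimit state across all callers; a per-call-site alternative would be the ratelimited device-printk helper, e.g., assuming the same swiotlb_tbl_map_single() context:

	dev_warn_ratelimited(hwdev, "swiotlb buffer is full (sz: %zd bytes)\n", size);

which keeps a static ratelimit state per call site.)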
Acked-by: Stefano Stabellini [v2: s/ld/zs on the printk] Signed-off-by: Konrad Rzeszutek Wilk --- lib/swiotlb.c | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) (limited to 'lib') diff --git a/lib/swiotlb.c b/lib/swiotlb.c index e4399fa65ad..4634ac9cdb3 100644 --- a/lib/swiotlb.c +++ b/lib/swiotlb.c @@ -505,7 +505,8 @@ phys_addr_t swiotlb_tbl_map_single(struct device *hwdev, not_found: spin_unlock_irqrestore(&io_tlb_lock, flags); - dev_warn(hwdev, "swiotlb buffer is full\n"); + if (printk_ratelimit()) + dev_warn(hwdev, "swiotlb buffer is full (sz: %zd bytes)\n", size); return SWIOTLB_MAP_ERROR; found: spin_unlock_irqrestore(&io_tlb_lock, flags); -- cgit v1.2.3-18-g5258 From 9705710e40b9e5acaf003f90d6ed883ba193b314 Mon Sep 17 00:00:00 2001 From: Bart Van Assche Date: Sat, 4 Jan 2014 14:20:04 +0100 Subject: kobject: Fix source code comment spelling Signed-off-by: Bart Van Assche Signed-off-by: Greg Kroah-Hartman --- lib/kobject.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'lib') diff --git a/lib/kobject.c b/lib/kobject.c index f7f69cbe4f6..b0b26665c61 100644 --- a/lib/kobject.c +++ b/lib/kobject.c @@ -365,7 +365,7 @@ static int kobject_add_varg(struct kobject *kobj, struct kobject *parent, * * If @parent is set, then the parent of the @kobj will be set to it. * If @parent is NULL, then the parent of the @kobj will be set to the - * kobject associted with the kset assigned to this kobject. If no kset + * kobject associated with the kset assigned to this kobject. If no kset * is assigned to the kobject, then the kobject will be located in the * root of the sysfs tree. * -- cgit v1.2.3-18-g5258 From 8bc588e0e585bc9085df75e84d4d5635f45cf360 Mon Sep 17 00:00:00 2001 From: Lubomir Rintel Date: Sun, 22 Dec 2013 11:34:22 +0100 Subject: firewire: ohci: Turn remote DMA support into a module parameter This makes it possible to debug the kernel over FireWire without the need to recompile it. [Stefan R: changed description from "...0" to "...N"] Cc: Dave Hansen Signed-off-by: Lubomir Rintel Signed-off-by: Stefan Richter --- lib/Kconfig.debug | 11 ----------- 1 file changed, 11 deletions(-) (limited to 'lib') diff --git a/lib/Kconfig.debug b/lib/Kconfig.debug index db25707aa41..dc30284a3b0 100644 --- a/lib/Kconfig.debug +++ b/lib/Kconfig.debug @@ -1547,17 +1547,6 @@ config PROVIDE_OHCI1394_DMA_INIT See Documentation/debugging-via-ohci1394.txt for more information. -config FIREWIRE_OHCI_REMOTE_DMA - bool "Remote debugging over FireWire with firewire-ohci" - depends on FIREWIRE_OHCI - help - This option lets you use the FireWire bus for remote debugging - with help of the firewire-ohci driver. It enables unfiltered - remote DMA in firewire-ohci. - See Documentation/debugging-via-ohci1394.txt for more information. - - If unsure, say N. - config BUILD_DOCSRC bool "Build targets in Documentation/ tree" depends on HEADERS_CHECK -- cgit v1.2.3-18-g5258 From 03144b5869d2921dafbea477246aeb5d4eb5fc38 Mon Sep 17 00:00:00 2001 From: Michael Dalton Date: Thu, 16 Jan 2014 22:23:29 -0800 Subject: lib: Ensure EWMA does not store wrong intermediate values To ensure ewma_read() without a lock returns a valid but possibly out-of-date average, modify ewma_add() by using ACCESS_ONCE to prevent wrong intermediate values from being written to avg->internal. Suggested-by: Eric Dumazet Acked-by: Michael S. Tsirkin Acked-by: Eric Dumazet Signed-off-by: Michael Dalton Signed-off-by: David S.
Miller --- lib/average.c | 6 ++++-- 1 file changed, 4 insertions(+), 2 deletions(-) (limited to 'lib') diff --git a/lib/average.c b/lib/average.c index 99a67e662b3..114d1beae0c 100644 --- a/lib/average.c +++ b/lib/average.c @@ -53,8 +53,10 @@ EXPORT_SYMBOL(ewma_init); */ struct ewma *ewma_add(struct ewma *avg, unsigned long val) { - avg->internal = avg->internal ? - (((avg->internal << avg->weight) - avg->internal) + + unsigned long internal = ACCESS_ONCE(avg->internal); + + ACCESS_ONCE(avg->internal) = internal ? + (((internal << avg->weight) - internal) + (val << avg->factor)) >> avg->weight : (val << avg->factor); return avg; -- cgit v1.2.3-18-g5258 From 82ef3d5d5f3ffd757c960693c4fe7a0051211849 Mon Sep 17 00:00:00 2001 From: Weilong Chen Date: Thu, 16 Jan 2014 17:24:31 +0800 Subject: net: fix "queues" uevent between network namespaces When I create a new namespace with 'ip netns add net0', or add/remove new links in a namespace with 'ip link add/delete type veth', rx/tx queue events can be seen in all namespaces. That is because the rx/tx queue ktypes do not have namespace support, and their kobj parents are set to NULL. This patch fixes that. Reported-by: Libo Chen Signed-off-by: Libo Chen Signed-off-by: Weilong Chen Acked-by: Greg Kroah-Hartman Signed-off-by: David S. Miller --- lib/kobject_uevent.c | 10 ++++++++-- 1 file changed, 8 insertions(+), 2 deletions(-) (limited to 'lib') diff --git a/lib/kobject_uevent.c b/lib/kobject_uevent.c index 52e5abbc41d..5f72767ddd9 100644 --- a/lib/kobject_uevent.c +++ b/lib/kobject_uevent.c @@ -88,11 +88,17 @@ out: #ifdef CONFIG_NET static int kobj_bcast_filter(struct sock *dsk, struct sk_buff *skb, void *data) { - struct kobject *kobj = data; + struct kobject *kobj = data, *ksobj; const struct kobj_ns_type_operations *ops; ops = kobj_ns_ops(kobj); - if (ops) { + if (!ops && kobj->kset) { + ksobj = &kobj->kset->kobj; + if (ksobj->parent != NULL) + ops = kobj_ns_ops(ksobj->parent); + } + + if (ops && ops->netlink_ns && kobj->ktype->namespace) { const void *sock_ns, *ns; ns = kobj->ktype->namespace(kobj); sock_ns = ops->netlink_ns(dsk); -- cgit v1.2.3-18-g5258 From 687b0ad2751ca8ea418396fa780e22571fba76a8 Mon Sep 17 00:00:00 2001 From: Kent Overstreet Date: Mon, 6 Jan 2014 13:13:26 -0800 Subject: percpu-refcount: Add a WARN() for ref going negative AIO had a missing get, which led to an ioctx leak - after percpu_ref_kill() the ref was 0 so percpu_ref_put() never saw it hit 0. This wasn't noticed at the time because it all happened completely silently; this adds a WARN() which would've caught the aio bug. tj: Used WARN_ONCE() instead of WARN().
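(A hedged sketch, not taken from the aio code, of the bug class this WARN catches — an unpaired put after percpu_ref_kill() drives the count negative instead of letting it hit zero:)

	#include <linux/percpu-refcount.h>

	static void buggy_teardown(struct percpu_ref *ref)
	{
		percpu_ref_kill(ref);	/* fold the percpu counts into the atomic count */
		percpu_ref_put(ref);	/* drops the initial reference */
		percpu_ref_put(ref);	/* unpaired put: count goes negative,
					 * now flagged by the WARN_ONCE() below */
	}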
Signed-off-by: Kent Overstreet Signed-off-by: Tejun Heo --- lib/percpu-refcount.c | 3 +++ 1 file changed, 3 insertions(+) (limited to 'lib') diff --git a/lib/percpu-refcount.c b/lib/percpu-refcount.c index 1a53d497a8c..963b7034a51 100644 --- a/lib/percpu-refcount.c +++ b/lib/percpu-refcount.c @@ -120,6 +120,9 @@ static void percpu_ref_kill_rcu(struct rcu_head *rcu) atomic_add((int) count - PCPU_COUNT_BIAS, &ref->count); + WARN_ONCE(atomic_read(&ref->count) <= 0, "percpu ref <= 0 (%i)", + atomic_read(&ref->count)); + /* @ref is viewed as dead on all CPUs, send out kill confirmation */ if (ref->confirm_kill) ref->confirm_kill(ref); -- cgit v1.2.3-18-g5258 From 0abdd7a81b7e3fd781d7fabcca49501852bba17e Mon Sep 17 00:00:00 2001 From: Dan Williams Date: Tue, 21 Jan 2014 15:48:12 -0800 Subject: dma-debug: introduce debug_dma_assert_idle() Record actively mapped pages and provide an API for asserting that a given page is DMA-inactive before execution proceeds. Placing debug_dma_assert_idle() in cow_user_page() flagged the violation of the dma-api in the NET_DMA implementation (see commit 77873803363c "net_dma: mark broken"). The implementation includes the capability to count, in a limited way, repeat mappings of the same page that occur without an intervening unmap. This 'overlap' counter is limited to the few bits of tag space in a radix tree. This mechanism is added to mitigate false negative cases where, for example, a page is dma mapped twice and debug_dma_assert_idle() is called after the page is un-mapped once. Signed-off-by: Dan Williams Cc: Joerg Roedel Cc: Vinod Koul Cc: Russell King Cc: James Bottomley Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- lib/Kconfig.debug | 12 +++- lib/dma-debug.c | 193 ++++++++++++++++++++++++++++++++++++++++++++++++++---- 2 files changed, 190 insertions(+), 15 deletions(-) (limited to 'lib') diff --git a/lib/Kconfig.debug b/lib/Kconfig.debug index 6982094a7e7..900b63c1e89 100644 --- a/lib/Kconfig.debug +++ b/lib/Kconfig.debug @@ -1584,8 +1584,16 @@ config DMA_API_DEBUG With this option you will be able to detect common bugs in device drivers like double-freeing of DMA mappings or freeing mappings that were never allocated. - This option causes a performance degredation. Use only if you want - to debug device drivers. If unsure, say N. + + This also attempts to catch cases where a page owned by DMA is + accessed by the cpu in a way that could cause data corruption. For + example, this enables cow_user_page() to check that the source page is + not undergoing DMA. + + This option causes a performance degradation. Use only if you want to + debug device drivers and dma interactions. + + If unsure, say N.
source "samples/Kconfig" diff --git a/lib/dma-debug.c b/lib/dma-debug.c index d87a17a819d..c38083871f1 100644 --- a/lib/dma-debug.c +++ b/lib/dma-debug.c @@ -53,11 +53,26 @@ enum map_err_types { #define DMA_DEBUG_STACKTRACE_ENTRIES 5 +/** + * struct dma_debug_entry - track a dma_map* or dma_alloc_coherent mapping + * @list: node on pre-allocated free_entries list + * @dev: 'dev' argument to dma_map_{page|single|sg} or dma_alloc_coherent + * @type: single, page, sg, coherent + * @pfn: page frame of the start address + * @offset: offset of mapping relative to pfn + * @size: length of the mapping + * @direction: enum dma_data_direction + * @sg_call_ents: 'nents' from dma_map_sg + * @sg_mapped_ents: 'mapped_ents' from dma_map_sg + * @map_err_type: track whether dma_mapping_error() was checked + * @stacktrace: support backtraces when a violation is detected + */ struct dma_debug_entry { struct list_head list; struct device *dev; int type; - phys_addr_t paddr; + unsigned long pfn; + size_t offset; u64 dev_addr; u64 size; int direction; @@ -372,6 +387,11 @@ static void hash_bucket_del(struct dma_debug_entry *entry) list_del(&entry->list); } +static unsigned long long phys_addr(struct dma_debug_entry *entry) +{ + return page_to_phys(pfn_to_page(entry->pfn)) + entry->offset; +} + /* * Dump mapping entries for debugging purposes */ @@ -389,9 +409,9 @@ void debug_dma_dump_mappings(struct device *dev) list_for_each_entry(entry, &bucket->list, list) { if (!dev || dev == entry->dev) { dev_info(entry->dev, - "%s idx %d P=%Lx D=%Lx L=%Lx %s %s\n", + "%s idx %d P=%Lx N=%lx D=%Lx L=%Lx %s %s\n", type2name[entry->type], idx, - (unsigned long long)entry->paddr, + phys_addr(entry), entry->pfn, entry->dev_addr, entry->size, dir2name[entry->direction], maperr2str[entry->map_err_type]); @@ -403,6 +423,133 @@ void debug_dma_dump_mappings(struct device *dev) } EXPORT_SYMBOL(debug_dma_dump_mappings); +/* + * For each page mapped (initial page in the case of + * dma_alloc_coherent/dma_map_{single|page}, or each page in a + * scatterlist) insert into this tree using the pfn as the key. At + * dma_unmap_{single|sg|page} or dma_free_coherent delete the entry. If + * the pfn already exists at insertion time add a tag as a reference + * count for the overlapping mappings. For now, the overlap tracking + * just ensures that 'unmaps' balance 'maps' before marking the pfn + * idle, but we should also be flagging overlaps as an API violation. + * + * Memory usage is mostly constrained by the maximum number of available + * dma-debug entries in that we need a free dma_debug_entry before + * inserting into the tree. In the case of dma_map_{single|page} and + * dma_alloc_coherent there is only one dma_debug_entry and one pfn to + * track per event. dma_map_sg(), on the other hand, + * consumes a single dma_debug_entry, but inserts 'nents' entries into + * the tree. + * + * At any time debug_dma_assert_idle() can be called to trigger a + * warning if the given page is in the active set. 
+ */ +static RADIX_TREE(dma_active_pfn, GFP_NOWAIT); +static DEFINE_SPINLOCK(radix_lock); +#define ACTIVE_PFN_MAX_OVERLAP ((1 << RADIX_TREE_MAX_TAGS) - 1) + +static int active_pfn_read_overlap(unsigned long pfn) +{ + int overlap = 0, i; + + for (i = RADIX_TREE_MAX_TAGS - 1; i >= 0; i--) + if (radix_tree_tag_get(&dma_active_pfn, pfn, i)) + overlap |= 1 << i; + return overlap; +} + +static int active_pfn_set_overlap(unsigned long pfn, int overlap) +{ + int i; + + if (overlap > ACTIVE_PFN_MAX_OVERLAP || overlap < 0) + return 0; + + for (i = RADIX_TREE_MAX_TAGS - 1; i >= 0; i--) + if (overlap & 1 << i) + radix_tree_tag_set(&dma_active_pfn, pfn, i); + else + radix_tree_tag_clear(&dma_active_pfn, pfn, i); + + return overlap; +} + +static void active_pfn_inc_overlap(unsigned long pfn) +{ + int overlap = active_pfn_read_overlap(pfn); + + overlap = active_pfn_set_overlap(pfn, ++overlap); + + /* If we overflowed the overlap counter then we're potentially + * leaking dma-mappings. Otherwise, if maps and unmaps are + * balanced then this overflow may cause false negatives in + * debug_dma_assert_idle() as the pfn may be marked idle + * prematurely. + */ + WARN_ONCE(overlap == 0, + "DMA-API: exceeded %d overlapping mappings of pfn %lx\n", + ACTIVE_PFN_MAX_OVERLAP, pfn); +} + +static int active_pfn_dec_overlap(unsigned long pfn) +{ + int overlap = active_pfn_read_overlap(pfn); + + return active_pfn_set_overlap(pfn, --overlap); +} + +static int active_pfn_insert(struct dma_debug_entry *entry) +{ + unsigned long flags; + int rc; + + spin_lock_irqsave(&radix_lock, flags); + rc = radix_tree_insert(&dma_active_pfn, entry->pfn, entry); + if (rc == -EEXIST) + active_pfn_inc_overlap(entry->pfn); + spin_unlock_irqrestore(&radix_lock, flags); + + return rc; +} + +static void active_pfn_remove(struct dma_debug_entry *entry) +{ + unsigned long flags; + + spin_lock_irqsave(&radix_lock, flags); + if (active_pfn_dec_overlap(entry->pfn) == 0) + radix_tree_delete(&dma_active_pfn, entry->pfn); + spin_unlock_irqrestore(&radix_lock, flags); +} + +/** + * debug_dma_assert_idle() - assert that a page is not undergoing dma + * @page: page to lookup in the dma_active_pfn tree + * + * Place a call to this routine in cases where the cpu touching the page + * before the dma completes (page is dma_unmapped) will lead to data + * corruption. + */ +void debug_dma_assert_idle(struct page *page) +{ + unsigned long flags; + struct dma_debug_entry *entry; + + if (!page) + return; + + spin_lock_irqsave(&radix_lock, flags); + entry = radix_tree_lookup(&dma_active_pfn, page_to_pfn(page)); + spin_unlock_irqrestore(&radix_lock, flags); + + if (!entry) + return; + + err_printk(entry->dev, entry, + "DMA-API: cpu touching an active dma mapped page " + "[pfn=0x%lx]\n", entry->pfn); +} + /* * Wrapper function for adding an entry to the hash. * This function takes care of locking itself. 
@@ -411,10 +558,21 @@ static void add_dma_entry(struct dma_debug_entry *entry) { struct hash_bucket *bucket; unsigned long flags; + int rc; bucket = get_hash_bucket(entry, &flags); hash_bucket_add(bucket, entry); put_hash_bucket(bucket, &flags); + + rc = active_pfn_insert(entry); + if (rc == -ENOMEM) { + pr_err("DMA-API: pfn tracking ENOMEM, dma-debug disabled\n"); + global_disable = true; + } + + /* TODO: report -EEXIST errors here as overlapping mappings are + * not supported by the DMA API + */ } static struct dma_debug_entry *__dma_entry_alloc(void) @@ -469,6 +627,8 @@ static void dma_entry_free(struct dma_debug_entry *entry) { unsigned long flags; + active_pfn_remove(entry); + /* * add to beginning of the list - this way the entries are * more likely cache hot when they are reallocated. @@ -895,15 +1055,15 @@ static void check_unmap(struct dma_debug_entry *ref) ref->dev_addr, ref->size, type2name[entry->type], type2name[ref->type]); } else if ((entry->type == dma_debug_coherent) && - (ref->paddr != entry->paddr)) { + (phys_addr(ref) != phys_addr(entry))) { err_printk(ref->dev, entry, "DMA-API: device driver frees " "DMA memory with different CPU address " "[device address=0x%016llx] [size=%llu bytes] " "[cpu alloc address=0x%016llx] " "[cpu free address=0x%016llx]", ref->dev_addr, ref->size, - (unsigned long long)entry->paddr, - (unsigned long long)ref->paddr); + phys_addr(entry), + phys_addr(ref)); } if (ref->sg_call_ents && ref->type == dma_debug_sg && @@ -1052,7 +1212,8 @@ void debug_dma_map_page(struct device *dev, struct page *page, size_t offset, entry->dev = dev; entry->type = dma_debug_page; - entry->paddr = page_to_phys(page) + offset; + entry->pfn = page_to_pfn(page); + entry->offset = offset, entry->dev_addr = dma_addr; entry->size = size; entry->direction = direction; @@ -1148,7 +1309,8 @@ void debug_dma_map_sg(struct device *dev, struct scatterlist *sg, entry->type = dma_debug_sg; entry->dev = dev; - entry->paddr = sg_phys(s); + entry->pfn = page_to_pfn(sg_page(s)); + entry->offset = s->offset, entry->size = sg_dma_len(s); entry->dev_addr = sg_dma_address(s); entry->direction = direction; @@ -1198,7 +1360,8 @@ void debug_dma_unmap_sg(struct device *dev, struct scatterlist *sglist, struct dma_debug_entry ref = { .type = dma_debug_sg, .dev = dev, - .paddr = sg_phys(s), + .pfn = page_to_pfn(sg_page(s)), + .offset = s->offset, .dev_addr = sg_dma_address(s), .size = sg_dma_len(s), .direction = dir, @@ -1233,7 +1396,8 @@ void debug_dma_alloc_coherent(struct device *dev, size_t size, entry->type = dma_debug_coherent; entry->dev = dev; - entry->paddr = virt_to_phys(virt); + entry->pfn = page_to_pfn(virt_to_page(virt)); + entry->offset = (size_t) virt & PAGE_MASK; entry->size = size; entry->dev_addr = dma_addr; entry->direction = DMA_BIDIRECTIONAL; @@ -1248,7 +1412,8 @@ void debug_dma_free_coherent(struct device *dev, size_t size, struct dma_debug_entry ref = { .type = dma_debug_coherent, .dev = dev, - .paddr = virt_to_phys(virt), + .pfn = page_to_pfn(virt_to_page(virt)), + .offset = (size_t) virt & PAGE_MASK, .dev_addr = addr, .size = size, .direction = DMA_BIDIRECTIONAL, @@ -1356,7 +1521,8 @@ void debug_dma_sync_sg_for_cpu(struct device *dev, struct scatterlist *sg, struct dma_debug_entry ref = { .type = dma_debug_sg, .dev = dev, - .paddr = sg_phys(s), + .pfn = page_to_pfn(sg_page(s)), + .offset = s->offset, .dev_addr = sg_dma_address(s), .size = sg_dma_len(s), .direction = direction, @@ -1388,7 +1554,8 @@ void debug_dma_sync_sg_for_device(struct device *dev, struct scatterlist 
*sg, struct dma_debug_entry ref = { .type = dma_debug_sg, .dev = dev, - .paddr = sg_phys(s), + .pfn = page_to_pfn(sg_page(s)), + .offset = s->offset, .dev_addr = sg_dma_address(s), .size = sg_dma_len(s), .direction = direction, -- cgit v1.2.3-18-g5258 From aec6a8889a98a0cd58357cd0937a25189908f191 Mon Sep 17 00:00:00 2001 From: Mel Gorman Date: Tue, 21 Jan 2014 15:49:13 -0800 Subject: mm, show_mem: remove SHOW_MEM_FILTER_PAGE_COUNT Commit 4b59e6c47309 ("mm, show_mem: suppress page counts in non-blockable contexts") introduced SHOW_MEM_FILTER_PAGE_COUNT to suppress PFN walks on large memory machines. Commit c78e93630d15 ("mm: do not walk all of system memory during show_mem") avoided a PFN walk in the generic show_mem helper which removes the requirement for SHOW_MEM_FILTER_PAGE_COUNT in that case. This patch removes PFN walkers from the arch-specific implementations that report on a per-node or per-zone granularity. ARM and unicore32 still do a PFN walk as they report memory usage on each bank, which is a much finer granularity at which the debugging information may still be of use. As the remaining arches doing PFN walks have relatively small amounts of memory, this patch simply removes SHOW_MEM_FILTER_PAGE_COUNT. [akpm@linux-foundation.org: fix parisc] Signed-off-by: Mel Gorman Acked-by: David Rientjes Cc: Tony Luck Cc: Russell King Cc: James Bottomley Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- lib/show_mem.c | 3 --- 1 file changed, 3 deletions(-) (limited to 'lib') diff --git a/lib/show_mem.c b/lib/show_mem.c index 5847a4921b8..f58689f5a24 100644 --- a/lib/show_mem.c +++ b/lib/show_mem.c @@ -17,9 +17,6 @@ void show_mem(unsigned int filter) printk("Mem-Info:\n"); show_free_areas(filter); - if (filter & SHOW_MEM_FILTER_PAGE_COUNT) - return; - for_each_online_pgdat(pgdat) { unsigned long flags; int zoneid; -- cgit v1.2.3-18-g5258 From 457ff1de2d247d9b8917c4664c2325321a35e313 Mon Sep 17 00:00:00 2001 From: Santosh Shilimkar Date: Tue, 21 Jan 2014 15:50:30 -0800 Subject: lib/swiotlb.c: use memblock apis for early memory allocations Switch to memblock interfaces for the early memory allocator instead of the bootmem allocator. No functional change in behavior compared to the current code, from the bootmem users' point of view. Archs already converted to NO_BOOTMEM now directly use memblock interfaces instead of bootmem wrappers built on top of memblock. And for the archs which still use bootmem, these new APIs just fall back to the existing bootmem APIs. Signed-off-by: Santosh Shilimkar Cc: "Rafael J. Wysocki" Cc: Arnd Bergmann Cc: Christoph Lameter Cc: Greg Kroah-Hartman Cc: Grygorii Strashko Cc: H.
Peter Anvin Cc: Johannes Weiner Cc: KAMEZAWA Hiroyuki Cc: Konrad Rzeszutek Wilk Cc: Michal Hocko Cc: Paul Walmsley Cc: Pavel Machek Cc: Russell King Cc: Tejun Heo Cc: Tony Lindgren Cc: Yinghai Lu Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- lib/swiotlb.c | 35 ++++++++++++++++++++--------------- 1 file changed, 20 insertions(+), 15 deletions(-) (limited to 'lib') diff --git a/lib/swiotlb.c b/lib/swiotlb.c index e4399fa65ad..615f3de4b5c 100644 --- a/lib/swiotlb.c +++ b/lib/swiotlb.c @@ -172,8 +172,9 @@ int __init swiotlb_init_with_tbl(char *tlb, unsigned long nslabs, int verbose) /* * Get the overflow emergency buffer */ - v_overflow_buffer = alloc_bootmem_low_pages_nopanic( - PAGE_ALIGN(io_tlb_overflow)); + v_overflow_buffer = memblock_virt_alloc_nopanic( + PAGE_ALIGN(io_tlb_overflow), + PAGE_SIZE); if (!v_overflow_buffer) return -ENOMEM; @@ -184,11 +185,15 @@ int __init swiotlb_init_with_tbl(char *tlb, unsigned long nslabs, int verbose) * to find contiguous free memory regions of size up to IO_TLB_SEGSIZE * between io_tlb_start and io_tlb_end. */ - io_tlb_list = alloc_bootmem_pages(PAGE_ALIGN(io_tlb_nslabs * sizeof(int))); + io_tlb_list = memblock_virt_alloc( + PAGE_ALIGN(io_tlb_nslabs * sizeof(int)), + PAGE_SIZE); for (i = 0; i < io_tlb_nslabs; i++) io_tlb_list[i] = IO_TLB_SEGSIZE - OFFSET(i, IO_TLB_SEGSIZE); io_tlb_index = 0; - io_tlb_orig_addr = alloc_bootmem_pages(PAGE_ALIGN(io_tlb_nslabs * sizeof(phys_addr_t))); + io_tlb_orig_addr = memblock_virt_alloc( + PAGE_ALIGN(io_tlb_nslabs * sizeof(phys_addr_t)), + PAGE_SIZE); if (verbose) swiotlb_print_info(); @@ -215,13 +220,13 @@ swiotlb_init(int verbose) bytes = io_tlb_nslabs << IO_TLB_SHIFT; /* Get IO TLB memory from the low pages */ - vstart = alloc_bootmem_low_pages_nopanic(PAGE_ALIGN(bytes)); + vstart = memblock_virt_alloc_nopanic(PAGE_ALIGN(bytes), PAGE_SIZE); if (vstart && !swiotlb_init_with_tbl(vstart, io_tlb_nslabs, verbose)) return; if (io_tlb_start) - free_bootmem(io_tlb_start, - PAGE_ALIGN(io_tlb_nslabs << IO_TLB_SHIFT)); + memblock_free_early(io_tlb_start, + PAGE_ALIGN(io_tlb_nslabs << IO_TLB_SHIFT)); pr_warn("Cannot allocate SWIOTLB buffer"); no_iotlb_memory = true; } @@ -357,14 +362,14 @@ void __init swiotlb_free(void) free_pages((unsigned long)phys_to_virt(io_tlb_start), get_order(io_tlb_nslabs << IO_TLB_SHIFT)); } else { - free_bootmem_late(io_tlb_overflow_buffer, - PAGE_ALIGN(io_tlb_overflow)); - free_bootmem_late(__pa(io_tlb_orig_addr), - PAGE_ALIGN(io_tlb_nslabs * sizeof(phys_addr_t))); - free_bootmem_late(__pa(io_tlb_list), - PAGE_ALIGN(io_tlb_nslabs * sizeof(int))); - free_bootmem_late(io_tlb_start, - PAGE_ALIGN(io_tlb_nslabs << IO_TLB_SHIFT)); + memblock_free_late(io_tlb_overflow_buffer, + PAGE_ALIGN(io_tlb_overflow)); + memblock_free_late(__pa(io_tlb_orig_addr), + PAGE_ALIGN(io_tlb_nslabs * sizeof(phys_addr_t))); + memblock_free_late(__pa(io_tlb_list), + PAGE_ALIGN(io_tlb_nslabs * sizeof(int))); + memblock_free_late(io_tlb_start, + PAGE_ALIGN(io_tlb_nslabs << IO_TLB_SHIFT)); } io_tlb_nslabs = 0; } -- cgit v1.2.3-18-g5258 From c15295001aa940df4e3cf6574808a4addca9f2e5 Mon Sep 17 00:00:00 2001 From: Santosh Shilimkar Date: Tue, 21 Jan 2014 15:50:32 -0800 Subject: lib/cpumask.c: use memblock apis for early memory allocations Switch to memblock interfaces for the early memory allocator instead of the bootmem allocator. No functional change in behavior compared to the current code, from the bootmem users' point of view.
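(For orientation — the bootmem-to-memblock mapping used by this and the preceding swiotlb conversion, as visible in the diffs; the allocators' second argument is the requested alignment, 0 meaning the default:

	alloc_bootmem(size)                   -> memblock_virt_alloc(size, 0)
	alloc_bootmem_pages(size)             -> memblock_virt_alloc(size, PAGE_SIZE)
	alloc_bootmem_low_pages_nopanic(size) -> memblock_virt_alloc_nopanic(size, PAGE_SIZE)
	free_bootmem(pa, size)                -> memblock_free_early(pa, size)
	free_bootmem_late(pa, size)           -> memblock_free_late(pa, size)
)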
Archs already converted to NO_BOOTMEM now directly use memblock interfaces instead of bootmem wrappers built on top of memblock. And for the archs which still use bootmem, these new APIs just fall back to the existing bootmem APIs. Signed-off-by: Santosh Shilimkar Cc: "Rafael J. Wysocki" Cc: Arnd Bergmann Cc: Christoph Lameter Cc: Greg Kroah-Hartman Cc: Grygorii Strashko Cc: H. Peter Anvin Cc: Johannes Weiner Cc: KAMEZAWA Hiroyuki Cc: Konrad Rzeszutek Wilk Cc: Michal Hocko Cc: Paul Walmsley Cc: Pavel Machek Cc: Russell King Cc: Tejun Heo Cc: Tony Lindgren Cc: Yinghai Lu Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- lib/cpumask.c | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) (limited to 'lib') diff --git a/lib/cpumask.c b/lib/cpumask.c index d327b87c99b..b810b753c60 100644 --- a/lib/cpumask.c +++ b/lib/cpumask.c @@ -140,7 +140,7 @@ EXPORT_SYMBOL(zalloc_cpumask_var); */ void __init alloc_bootmem_cpumask_var(cpumask_var_t *mask) { - *mask = alloc_bootmem(cpumask_size()); + *mask = memblock_virt_alloc(cpumask_size(), 0); } /** @@ -161,6 +161,6 @@ EXPORT_SYMBOL(free_cpumask_var); */ void __init free_bootmem_cpumask_var(cpumask_var_t mask) { - free_bootmem(__pa(mask), cpumask_size()); + memblock_free_early(__pa(mask), cpumask_size()); } #endif -- cgit v1.2.3-18-g5258 From 25487d73a9670d911530eee2e31c20889ba117db Mon Sep 17 00:00:00 2001 From: Xishi Qiu Date: Tue, 21 Jan 2014 15:50:57 -0800 Subject: lib/show_mem.c: show num_poisoned_pages when oom Show num_poisoned_pages when OOM occurs; it is a little helpful in finding the reason. Also it will be emitted anytime show_mem() is called. Signed-off-by: Xishi Qiu Suggested-by: Naoya Horiguchi Acked-by: Michal Hocko Acked-by: David Rientjes Reviewed-by: Wanpeng Li Acked-by: KOSAKI Motohiro Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- lib/show_mem.c | 3 +++ 1 file changed, 3 insertions(+) (limited to 'lib') diff --git a/lib/show_mem.c b/lib/show_mem.c index f58689f5a24..09225796991 100644 --- a/lib/show_mem.c +++ b/lib/show_mem.c @@ -43,4 +43,7 @@ void show_mem(unsigned int filter) printk("%lu pages in pagetable cache\n", quicklist_total_size()); #endif +#ifdef CONFIG_MEMORY_FAILURE + printk("%lu pages hwpoisoned\n", atomic_long_read(&num_poisoned_pages)); +#endif } -- cgit v1.2.3-18-g5258 From 809fa972fd90ff27225294b17a027e908b2d7b7a Mon Sep 17 00:00:00 2001 From: Hannes Frederic Sowa Date: Wed, 22 Jan 2014 02:29:41 +0100 Subject: reciprocal_divide: update/correction of the algorithm MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Jakub Zawadzki noticed that some divisions by reciprocal_divide() were not correct [1][2], which he could also show with BPF code after divisions are transformed into reciprocal_value() for runtime invariance which can be passed to reciprocal_divide() later on; reverse in BPF dump ended up with a different, off-by-one K in some situations. This has been fixed by Eric Dumazet in commit aee636c4809fa5 ("bpf: do not use reciprocal divide"). This follow-up patch improves reciprocal_value() and reciprocal_divide() to work in all cases by using the Granlund and Montgomery method, so that future use is also safe and without any non-obvious side effects. Known problems with the old implementation were that division by 1 always returned 0 and some off-by-ones when the dividend and divisor were very large. This seemed to not be problematic with its current users, as far as we can tell.
Eric Dumazet checked the slab usage; we cannot say so with certainty in the case of flex_array. Still, in order to fix that, we propose an extension from the original implementation from commit 6a2d7a955d8d resp. [3][4], by using the algorithm proposed in "Division by Invariant Integers Using Multiplication" [5], Torbjörn Granlund and Peter L. Montgomery, that is, pseudocode for q = n/d where q, n, d are in the u32 universe: 1) Initialization: int l = ceil(log_2 d) uword m' = floor((1<<32)*((1<<l)-d)/d)+1 int sh_1 = min(l,1) int sh_2 = max(l-1,0) 2) For q = n/d, all uword: uword t = (n*m')>>32 q = (t+((n-t)>>sh_1))>>sh_2 The assembler implementation from Agner Fog [6] also helped a lot while implementing. We have tested the implementation on x86_64, ppc64, i686, s390x; on x86_64/haswell we're still half the latency compared to normal divide. Joint work with Daniel Borkmann. [1] http://www.wireshark.org/~darkjames/reciprocal-buggy.c [2] http://www.wireshark.org/~darkjames/set-and-dump-filter-k-bug.c [3] https://gmplib.org/~tege/division-paper.pdf [4] http://homepage.cs.uiowa.edu/~jones/bcd/divide.html [5] http://citeseerx.ist.psu.edu/viewdoc/summary?doi=10.1.1.1.2556 [6] http://www.agner.org/optimize/asmlib.zip Reported-by: Jakub Zawadzki Cc: Eric Dumazet Cc: Austin S Hemmelgarn Cc: linux-kernel@vger.kernel.org Cc: Jesse Gross Cc: Jamal Hadi Salim Cc: Stephen Hemminger Cc: Matt Mackall Cc: Pekka Enberg Cc: Christoph Lameter Cc: Andy Gospodarek Cc: Veaceslav Falico Cc: Jay Vosburgh Cc: Jakub Zawadzki Signed-off-by: Daniel Borkmann Signed-off-by: Hannes Frederic Sowa Signed-off-by: David S. Miller --- lib/flex_array.c | 7 ++++++- lib/reciprocal_div.c | 24 ++++++++++++++++++++---- 2 files changed, 26 insertions(+), 5 deletions(-) (limited to 'lib') diff --git a/lib/flex_array.c b/lib/flex_array.c index 6948a6692fc..2eed22fa507 100644 --- a/lib/flex_array.c +++ b/lib/flex_array.c @@ -90,8 +90,8 @@ struct flex_array *flex_array_alloc(int element_size, unsigned int total, { struct flex_array *ret; int elems_per_part = 0; - int reciprocal_elems = 0; int max_size = 0; + struct reciprocal_value reciprocal_elems = { 0 }; if (element_size) { elems_per_part = FLEX_ARRAY_ELEMENTS_PER_PART(element_size); @@ -119,6 +119,11 @@ EXPORT_SYMBOL(flex_array_alloc); static int fa_element_to_part_nr(struct flex_array *fa, unsigned int element_nr) { + /* + * if element_size == 0 we don't get here, so we never touch + * the zeroed fa->reciprocal_elems, which would yield invalid + * results + */ return reciprocal_divide(element_nr, fa->reciprocal_elems); } diff --git a/lib/reciprocal_div.c b/lib/reciprocal_div.c index 75510e94f7d..464152410c5 100644 --- a/lib/reciprocal_div.c +++ b/lib/reciprocal_div.c @@ -1,11 +1,27 @@ +#include <linux/kernel.h> #include #include #include -u32 reciprocal_value(u32 k) +/* + * For a description of the algorithm please have a look at + * include/linux/reciprocal_div.h + */ + +struct reciprocal_value reciprocal_value(u32 d) { - u64 val = (1LL << 32) + (k - 1); - do_div(val, k); - return (u32)val; + struct reciprocal_value R; + u64 m; + int l; + + l = fls(d - 1); + m = ((1ULL << 32) * ((1ULL << l) - d)); + do_div(m, d); + ++m; + R.m = (u32)m; + R.sh1 = min(l, 1); + R.sh2 = max(l - 1, 0); + + return R; } EXPORT_SYMBOL(reciprocal_value); -- cgit v1.2.3-18-g5258 From 30b02c4b2a1eb6b6ca83e31c6b65f63076a03c5b Mon Sep 17 00:00:00 2001 From: Stephen Hemminger Date: Thu, 23 Jan 2014 13:24:09 +0000 Subject: assoc_array: remove global variable The associative array code creates an unnecessary and potentially problematic global variable, 'status'. Remove it, since it is never used.
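(A brief illustration of how the stray global arises: in C, an identifier after the closing brace of an enum definition declares an object of that type at file scope, so

	enum assoc_array_walk_status {
		assoc_array_walk_tree_empty,
		/* ... */
	} status;		/* also defines a global variable 'status' */

whereas dropping the trailing identifier, as the patch does, leaves only the type definition.)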
Signed-off-by: Stephen Hemminger Signed-off-by: David Howells Signed-off-by: Linus Torvalds --- lib/assoc_array.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'lib') diff --git a/lib/assoc_array.c b/lib/assoc_array.c index 1b6a44f1ec3..c0b1007011e 100644 --- a/lib/assoc_array.c +++ b/lib/assoc_array.c @@ -157,7 +157,7 @@ enum assoc_array_walk_status { assoc_array_walk_tree_empty, assoc_array_walk_found_terminal_node, assoc_array_walk_found_wrong_shortcut, -} status; +}; struct assoc_array_walk_result { struct { -- cgit v1.2.3-18-g5258 From 6f6b5d1ec56acdeab0503d2b823f6f88a0af493e Mon Sep 17 00:00:00 2001 From: Kent Overstreet Date: Sun, 19 Jan 2014 08:26:37 +0000 Subject: percpu_ida: Make percpu_ida_alloc + callers accept task state bitmask This patch changes percpu_ida_alloc() + callers to accept a task state bitmask for prepare_to_wait(), for code like target/iscsi that needs it for interruptible sleep; that support is provided in a subsequent patch. It now expects TASK_UNINTERRUPTIBLE when the caller is able to sleep waiting for a new tag, or TASK_RUNNING when the caller cannot sleep, and is forced to return a negative value when no tags are available. v2 changes: - Include blk-mq + tcm_fc + vhost/scsi + target/iscsi changes - Drop signal_pending_state() call v3 changes: - Only call prepare_to_wait() + finish_wait() when != TASK_RUNNING (PeterZ) Reported-by: Linus Torvalds Cc: Linus Torvalds Cc: Ingo Molnar Cc: Peter Zijlstra Cc: Jens Axboe Signed-off-by: Kent Overstreet Cc: #3.12+ Signed-off-by: Nicholas Bellinger --- lib/percpu_ida.c | 16 +++++++++------- 1 file changed, 9 insertions(+), 7 deletions(-) (limited to 'lib') diff --git a/lib/percpu_ida.c b/lib/percpu_ida.c index 9d054bf91d0..58b671484ac 100644 --- a/lib/percpu_ida.c +++ b/lib/percpu_ida.c @@ -132,22 +132,22 @@ static inline unsigned alloc_local_tag(struct percpu_ida_cpu *tags) /** * percpu_ida_alloc - allocate a tag * @pool: pool to allocate from - * @gfp: gfp flags + * @state: task state for prepare_to_wait * * Returns a tag - an integer in the range [0..nr_tags) (passed to * tag_pool_init()), or otherwise -ENOSPC on allocation failure. * * Safe to be called from interrupt context (assuming it isn't passed - * __GFP_WAIT, of course). + * TASK_UNINTERRUPTIBLE, of course). * * @gfp indicates whether or not to wait until a free id is available (it's not * used for internal memory allocations); thus if passed __GFP_WAIT we may sleep * however long it takes until another thread frees an id (same semantics as a * mempool). * - * Will not fail if passed __GFP_WAIT. + * Will not fail if passed TASK_UNINTERRUPTIBLE.
*/ -int percpu_ida_alloc(struct percpu_ida *pool, gfp_t gfp) +int percpu_ida_alloc(struct percpu_ida *pool, int state) { DEFINE_WAIT(wait); struct percpu_ida_cpu *tags; @@ -174,7 +174,8 @@ int percpu_ida_alloc(struct percpu_ida *pool, gfp_t gfp) * * global lock held and irqs disabled, don't need percpu lock */ - prepare_to_wait(&pool->wait, &wait, TASK_UNINTERRUPTIBLE); + if (state != TASK_RUNNING) + prepare_to_wait(&pool->wait, &wait, state); if (!tags->nr_free) alloc_global_tags(pool, tags); @@ -191,7 +192,7 @@ int percpu_ida_alloc(struct percpu_ida *pool, gfp_t gfp) spin_unlock(&pool->lock); local_irq_restore(flags); - if (tag >= 0 || !(gfp & __GFP_WAIT)) + if (tag >= 0 || state == TASK_RUNNING) break; schedule(); @@ -199,8 +200,9 @@ int percpu_ida_alloc(struct percpu_ida *pool, gfp_t gfp) local_irq_save(flags); tags = this_cpu_ptr(pool->tag_cpu); } + if (state != TASK_RUNNING) + finish_wait(&pool->wait, &wait); - finish_wait(&pool->wait, &wait); return tag; } EXPORT_SYMBOL_GPL(percpu_ida_alloc); -- cgit v1.2.3-18-g5258 From aace05097a0fd467230e39acb148be0fdaa90068 Mon Sep 17 00:00:00 2001 From: "Du, Changbin" Date: Thu, 23 Jan 2014 15:54:12 -0800 Subject: lib/parser.c: add match_wildcard() function The match_wildcard() function is a simple implementation of a wildcard matching algorithm. It only supports the two usual wildcards: '*' - matches zero or more characters '?' - matches one character This algorithm is safe since it is non-recursive. Signed-off-by: Du, Changbin Cc: Jason Baron Cc: Joe Perches Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- lib/parser.c | 51 +++++++++++++++++++++++++++++++++++++++++++++++++++ 1 file changed, 51 insertions(+) (limited to 'lib') diff --git a/lib/parser.c b/lib/parser.c index 807b2aaa33f..ee5295541ce 100644 --- a/lib/parser.c +++ b/lib/parser.c @@ -192,6 +192,56 @@ int match_hex(substring_t *s, int *result) return match_number(s, result, 16); } +/** + * match_wildcard: - parse if a string matches given wildcard pattern + * @pattern: wildcard pattern + * @str: the string to be parsed + * + * Description: Parse the string @str to check if matches wildcard + * pattern @pattern. The pattern may contain two type wildcardes: + * '*' - matches zero or more characters + * '?' - matches one character + * If it's matched, return true, else return false.
+ */ +bool match_wildcard(const char *pattern, const char *str) +{ + const char *s = str; + const char *p = pattern; + bool star = false; + + while (*s) { + switch (*p) { + case '?': + s++; + p++; + break; + case '*': + star = true; + str = s; + if (!*++p) + return true; + pattern = p; + break; + default: + if (*s == *p) { + s++; + p++; + } else { + if (!star) + return false; + str++; + s = str; + p = pattern; + } + break; + } + } + + if (*p == '*') + ++p; + return !*p; +} + /** * match_strlcpy: - Copy the characters from a substring_t to a sized buffer * @dest: where to copy to @@ -235,5 +285,6 @@ EXPORT_SYMBOL(match_token); EXPORT_SYMBOL(match_int); EXPORT_SYMBOL(match_octal); EXPORT_SYMBOL(match_hex); +EXPORT_SYMBOL(match_wildcard); EXPORT_SYMBOL(match_strlcpy); EXPORT_SYMBOL(match_strdup); -- cgit v1.2.3-18-g5258 From a3d2cca43cd31479d4f0414b4d014c4400a4e6d6 Mon Sep 17 00:00:00 2001 From: Andrew Morton Date: Thu, 23 Jan 2014 15:54:13 -0800 Subject: lib/parser.c: put EXPORT_SYMBOLs in the conventional place Cc: Du, Changbin Cc: Joe Perches Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- lib/parser.c | 13 ++++++------- 1 file changed, 6 insertions(+), 7 deletions(-) (limited to 'lib') diff --git a/lib/parser.c b/lib/parser.c index ee5295541ce..b6d11631231 100644 --- a/lib/parser.c +++ b/lib/parser.c @@ -113,6 +113,7 @@ int match_token(char *s, const match_table_t table, substring_t args[]) return p->token; } +EXPORT_SYMBOL(match_token); /** * match_number: scan a number in the given base from a substring_t @@ -163,6 +164,7 @@ int match_int(substring_t *s, int *result) { return match_number(s, result, 0); } +EXPORT_SYMBOL(match_int); /** * match_octal: - scan an octal representation of an integer from a substring_t @@ -177,6 +179,7 @@ int match_octal(substring_t *s, int *result) { return match_number(s, result, 8); } +EXPORT_SYMBOL(match_octal); /** * match_hex: - scan a hex representation of an integer from a substring_t @@ -191,6 +194,7 @@ int match_hex(substring_t *s, int *result) { return match_number(s, result, 16); } +EXPORT_SYMBOL(match_hex); /** * match_wildcard: - parse if a string matches given wildcard pattern @@ -241,6 +245,7 @@ bool match_wildcard(const char *pattern, const char *str) ++p; return !*p; } +EXPORT_SYMBOL(match_wildcard); /** * match_strlcpy: - Copy the characters from a substring_t to a sized buffer @@ -263,6 +268,7 @@ size_t match_strlcpy(char *dest, const substring_t *src, size_t size) } return ret; } +EXPORT_SYMBOL(match_strlcpy); /** * match_strdup: - allocate a new string with the contents of a substring_t @@ -280,11 +286,4 @@ char *match_strdup(const substring_t *s) match_strlcpy(p, s, sz); return p; } - -EXPORT_SYMBOL(match_token); -EXPORT_SYMBOL(match_int); -EXPORT_SYMBOL(match_octal); -EXPORT_SYMBOL(match_hex); -EXPORT_SYMBOL(match_wildcard); -EXPORT_SYMBOL(match_strlcpy); EXPORT_SYMBOL(match_strdup); -- cgit v1.2.3-18-g5258 From 578b1e0701af34f9ef69daabda4431f1e8501109 Mon Sep 17 00:00:00 2001 From: "Du, Changbin" Date: Thu, 23 Jan 2014 15:54:14 -0800 Subject: dynamic_debug: add wildcard support to filter files/functions/modules Add wildcard '*' (matches zero or more characters) and '?' (matches one character) support when querying debug flags. Now we can open debug messages using keywords. e.g.: 1. open debug logs in all usb drivers echo "file drivers/usb/* +p" > <debugfs>/dynamic_debug/control 2.
open debug logs for usb xhci code echo "file *xhci* +p" > <debugfs>/dynamic_debug/control Signed-off-by: Du, Changbin Cc: Jason Baron Cc: Joe Perches Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- lib/dynamic_debug.c | 15 ++++++++++----- 1 file changed, 10 insertions(+), 5 deletions(-) (limited to 'lib') diff --git a/lib/dynamic_debug.c b/lib/dynamic_debug.c index c37aeacd765..600ac57e277 100644 --- a/lib/dynamic_debug.c +